1perf_event_open(2)            System Calls Manual           perf_event_open(2)
2
3
4

NAME

6       perf_event_open - set up performance monitoring
7

LIBRARY

9       Standard C library (libc, -lc)
10

SYNOPSIS

12       #include <linux/perf_event.h>    /* Definition of PERF_* constants */
13       #include <linux/hw_breakpoint.h> /* Definition of HW_* constants */
14       #include <sys/syscall.h>         /* Definition of SYS_* constants */
15       #include <unistd.h>
16
17       int syscall(SYS_perf_event_open, struct perf_event_attr *attr,
18                   pid_t pid, int cpu, int group_fd, unsigned long flags);
19
20       Note:  glibc  provides  no wrapper for perf_event_open(), necessitating
21       the use of syscall(2).
22

DESCRIPTION

24       Given a list of parameters, perf_event_open() returns a  file  descrip‐
25       tor,  for  use  in subsequent system calls (read(2), mmap(2), prctl(2),
26       fcntl(2), etc.).
27
28       A call to perf_event_open() creates a file descriptor that allows  mea‐
29       suring  performance  information.   Each file descriptor corresponds to
30       one event that is measured; these can be grouped  together  to  measure
31       multiple events simultaneously.
32
33       Events  can  be  enabled and disabled in two ways: via ioctl(2) and via
34       prctl(2).  When an event is disabled it  does  not  count  or  generate
35       overflows but does continue to exist and maintain its count value.
36
37       Events come in two flavors: counting and sampling.  A counting event is
38       one that is used for counting the aggregate number of events  that  oc‐
39       cur.   In  general,  counting event results are gathered with a read(2)
40       call.  A sampling event periodically writes measurements  to  a  buffer
41       that can then be accessed via mmap(2).
42
43   Arguments
44       The  pid  and  cpu  arguments allow specifying which process and CPU to
45       monitor:
46
47       pid == 0 and cpu == -1
48              This measures the calling process/thread on any CPU.
49
50       pid == 0 and cpu >= 0
51              This measures the calling process/thread only  when  running  on
52              the specified CPU.
53
54       pid > 0 and cpu == -1
55              This measures the specified process/thread on any CPU.
56
57       pid > 0 and cpu >= 0
58              This  measures the specified process/thread only when running on
59              the specified CPU.
60
61       pid == -1 and cpu >= 0
62              This measures all processes/threads on the specified CPU.   This
63              requires CAP_PERFMON (since Linux 5.8) or CAP_SYS_ADMIN capabil‐
64              ity or a /proc/sys/kernel/perf_event_paranoid value of less than
65              1.
66
67       pid == -1 and cpu == -1
68              This setting is invalid and will return an error.
69
70       When  pid  is greater than zero, permission to perform this system call
71       is governed by CAP_PERFMON (since Linux 5.9) and a ptrace  access  mode
72       PTRACE_MODE_READ_REALCREDS   check   on   older   Linux  versions;  see
73       ptrace(2).
74
75       The group_fd argument allows event groups  to  be  created.   An  event
76       group  has  one event which is the group leader.  The leader is created
77       first, with group_fd = -1.  The rest of the group members  are  created
78       with  subsequent perf_event_open() calls with group_fd being set to the
79       file descriptor of the group leader.  (A single event  on  its  own  is
80       created  with group_fd = -1 and is considered to be a group with only 1
81       member.)  An event group is scheduled onto the CPU as a unit:  it  will
82       be  put  onto the CPU only if all of the events in the group can be put
83       onto the CPU.  This means that the values of the member events  can  be
84       meaningfully  compared —added, divided (to get ratios), and so on— with
85       each other, since they have counted events for the same set of executed
86       instructions.
87
88       The flags argument is formed by ORing together zero or more of the fol‐
89       lowing values:
90
91       PERF_FLAG_FD_CLOEXEC (since Linux 3.14)
92              This flag enables the close-on-exec flag for the  created  event
93              file  descriptor,  so  that the file descriptor is automatically
94              closed on execve(2).  Setting the  close-on-exec  flag  at  cre‐
95              ation  time,  rather  than later with fcntl(2), avoids potential
96              race   conditions   where    the    calling    thread    invokes
97              perf_event_open()  and  fcntl(2)  at  the  same  time as another
98              thread calls fork(2) then execve(2).
99
100       PERF_FLAG_FD_NO_GROUP
101              This flag tells the event to ignore the group_fd  parameter  ex‐
102              cept  for the purpose of setting up output redirection using the
103              PERF_FLAG_FD_OUTPUT flag.
104
105       PERF_FLAG_FD_OUTPUT (broken since Linux 2.6.35)
106              This flag re-routes the event's sampled output to instead be in‐
107              cluded in the mmap buffer of the event specified by group_fd.
108
109       PERF_FLAG_PID_CGROUP (since Linux 2.6.39)
110              This  flag  activates  per-container  system-wide monitoring.  A
111              container is an abstraction that isolates a set of resources for
112              finer-grained  control  (CPUs, memory, etc.).  In this mode, the
113              event is measured only if the thread running  on  the  monitored
114              CPU belongs to the designated container (cgroup).  The cgroup is
115              identified by passing a file descriptor opened on its  directory
116              in the cgroupfs filesystem.  For instance, if the cgroup to mon‐
117              itor  is  called  test,  then  a  file  descriptor   opened   on
118              /dev/cgroup/test  (assuming  cgroupfs is mounted on /dev/cgroup)
119              must be passed as  the  pid  parameter.   cgroup  monitoring  is
120              available  only for system-wide events and may therefore require
121              extra permissions.
122
123       The perf_event_attr structure provides detailed configuration  informa‐
124       tion for the event being created.
125
126           struct perf_event_attr {
127               __u32 type;                 /* Type of event */
128               __u32 size;                 /* Size of attribute structure */
129               __u64 config;               /* Type-specific configuration */
130
131               union {
132                   __u64 sample_period;    /* Period of sampling */
133                   __u64 sample_freq;      /* Frequency of sampling */
134               };
135
136               __u64 sample_type;  /* Specifies values included in sample */
137               __u64 read_format;  /* Specifies values returned in read */
138
139               __u64 disabled       : 1,   /* off by default */
140                     inherit        : 1,   /* children inherit it */
141                     pinned         : 1,   /* must always be on PMU */
142                     exclusive      : 1,   /* only group on PMU */
143                     exclude_user   : 1,   /* don't count user */
144                     exclude_kernel : 1,   /* don't count kernel */
145                     exclude_hv     : 1,   /* don't count hypervisor */
146                     exclude_idle   : 1,   /* don't count when idle */
147                     mmap           : 1,   /* include mmap data */
148                     comm           : 1,   /* include comm data */
149                     freq           : 1,   /* use freq, not period */
150                     inherit_stat   : 1,   /* per task counts */
151                     enable_on_exec : 1,   /* next exec enables */
152                     task           : 1,   /* trace fork/exit */
153                     watermark      : 1,   /* wakeup_watermark */
154                     precise_ip     : 2,   /* skid constraint */
155                     mmap_data      : 1,   /* non-exec mmap data */
156                     sample_id_all  : 1,   /* sample_type all events */
157                     exclude_host   : 1,   /* don't count in host */
158                     exclude_guest  : 1,   /* don't count in guest */
159                     exclude_callchain_kernel : 1,
160                                           /* exclude kernel callchains */
161                     exclude_callchain_user   : 1,
162                                           /* exclude user callchains */
163                     mmap2          :  1,  /* include mmap with inode data */
164                     comm_exec      :  1,  /* flag comm events that are
165                                              due to exec */
166                     use_clockid    :  1,  /* use clockid for time fields */
167                     context_switch :  1,  /* context switch data */
168                     write_backward :  1,  /* Write ring buffer from end
169                                              to beginning */
170                     namespaces     :  1,  /* include namespaces data */
171                     ksymbol        :  1,  /* include ksymbol events */
172                     bpf_event      :  1,  /* include bpf events */
173                     aux_output     :  1,  /* generate AUX records
174                                              instead of events */
175                     cgroup         :  1,  /* include cgroup events */
176                     text_poke      :  1,  /* include text poke events */
177                     build_id       :  1,  /* use build id in mmap2 events */
178                     inherit_thread :  1,  /* children only inherit */
179                                           /* if cloned with CLONE_THREAD */
180                     remove_on_exec :  1,  /* event is removed from task
181                                              on exec */
182                     sigtrap        :  1,  /* send synchronous SIGTRAP
183                                              on event */
184
185                     __reserved_1   : 26;
186
187               union {
188                   __u32 wakeup_events;    /* wakeup every n events */
189                   __u32 wakeup_watermark; /* bytes before wakeup */
190               };
191
192               __u32     bp_type;          /* breakpoint type */
193
194               union {
195                   __u64 bp_addr;          /* breakpoint address */
196                   __u64 kprobe_func;      /* for perf_kprobe */
197                   __u64 uprobe_path;      /* for perf_uprobe */
198                   __u64 config1;          /* extension of config */
199               };
200
201               union {
202                   __u64 bp_len;           /* breakpoint length */
203                   __u64 kprobe_addr;      /* with kprobe_func == NULL */
204                   __u64 probe_offset;     /* for perf_[k,u]probe */
205                   __u64 config2;          /* extension of config1 */
206               };
207               __u64 branch_sample_type;   /* enum perf_branch_sample_type */
208               __u64 sample_regs_user;     /* user regs to dump on samples */
209               __u32 sample_stack_user;    /* size of stack to dump on
210                                              samples */
211               __s32 clockid;              /* clock to use for time fields */
212               __u64 sample_regs_intr;     /* regs to dump on samples */
213               __u32 aux_watermark;        /* aux bytes before wakeup */
214               __u16 sample_max_stack;     /* max frames in callchain */
215               __u16 __reserved_2;         /* align to u64 */
216               __u32 aux_sample_size;      /* max aux sample size */
217               __u32 __reserved_3;         /* align to u64 */
218               __u64 sig_data;             /* user data for sigtrap */
219
220           };
221
222       The  fields  of the perf_event_attr structure are described in more de‐
223       tail below:
224
225       type   This field specifies the overall event type.  It has one of  the
226              following values:
227
228              PERF_TYPE_HARDWARE
229                     This  indicates  one of the "generalized" hardware events
230                     provided by the kernel.  See the config field  definition
231                     for more details.
232
233              PERF_TYPE_SOFTWARE
234                     This  indicates  one  of the software-defined events pro‐
235                     vided by the kernel  (even  if  no  hardware  support  is
236                     available).
237
238              PERF_TYPE_TRACEPOINT
239                     This indicates a tracepoint provided by the kernel trace‐
240                     point infrastructure.
241
242              PERF_TYPE_HW_CACHE
243                     This indicates a hardware cache event.  This has  a  spe‐
244                     cial encoding, described in the config field definition.
245
246              PERF_TYPE_RAW
247                     This  indicates  a "raw" implementation-specific event in
248                     the config field.
249
250              PERF_TYPE_BREAKPOINT (since Linux 2.6.33)
251                     This indicates a hardware breakpoint as provided  by  the
252                     CPU.   Breakpoints  can  be read/write accesses to an ad‐
253                     dress as well as execution of an instruction address.
254
255              dynamic PMU
256                     Since Linux 2.6.38, perf_event_open() can support  multi‐
257                     ple PMUs.  To enable this, a value exported by the kernel
258                     can be used in the type field to indicate  which  PMU  to
259                     use.  The value to use can be found in the sysfs filesys‐
260                     tem: there is  a  subdirectory  per  PMU  instance  under
261                     /sys/bus/event_source/devices.    In   each  subdirectory
262                     there is a type file whose content is an integer that can
263                     be    used    in   the   type   field.    For   instance,
264                     /sys/bus/event_source/devices/cpu/type contains the value
265                     for the core CPU PMU, which is usually 4.
266
267              kprobe and uprobe (since Linux 4.17)
268                     These  two dynamic PMUs create a kprobe/uprobe and attach
269                     it to the file descriptor generated  by  perf_event_open.
270                     The kprobe/uprobe will be destroyed on the destruction of
271                     the  file  descriptor.   See  fields   kprobe_func,   up‐
272                     robe_path,  kprobe_addr,  and  probe_offset  for more de‐
273                     tails.
274
275       size   The size of the perf_event_attr structure  for  forward/backward
276              compatibility.  Set this using sizeof(struct perf_event_attr) to
277              allow the kernel to see the struct size at the time of  compila‐
278              tion.
279
280              The  related  define  PERF_ATTR_SIZE_VER0 is set to 64; this was
281              the size of the first published struct.  PERF_ATTR_SIZE_VER1  is
282              72,  corresponding  to  the  addition  of  breakpoints  in Linux
283              2.6.33.  PERF_ATTR_SIZE_VER2 is 80 corresponding to the addition
284              of branch sampling in Linux 3.4.  PERF_ATTR_SIZE_VER3 is 96 cor‐
285              responding  to  the  addition  of  sample_regs_user   and   sam‐
286              ple_stack_user  in Linux 3.7.  PERF_ATTR_SIZE_VER4 is 104 corre‐
287              sponding to the addition  of  sample_regs_intr  in  Linux  3.19.
288              PERF_ATTR_SIZE_VER5  is  112  corresponding  to  the addition of
289              aux_watermark in Linux 4.1.
290
291       config This specifies which event you want,  in  conjunction  with  the
292              type  field.  The config1 and config2 fields are also taken into
293              account in cases where 64 bits is not enough  to  fully  specify
294              the event.  The encoding of these fields is event dependent.
295
296              There  are  various ways to set the config field that are depen‐
297              dent on the value of the previously described type field.   What
298              follows  are  various possible settings for config separated out
299              by type.
300
301              If type is PERF_TYPE_HARDWARE, we are measuring one of the  gen‐
302              eralized hardware CPU events.  Not all of these are available on
303              all platforms.  Set config to one of the following:
304
305                   PERF_COUNT_HW_CPU_CYCLES
306                          Total cycles.  Be wary of what  happens  during  CPU
307                          frequency scaling.
308
309                   PERF_COUNT_HW_INSTRUCTIONS
310                          Retired  instructions.  Be careful, these can be af‐
311                          fected by various issues, most notably hardware  in‐
312                          terrupt counts.
313
314                   PERF_COUNT_HW_CACHE_REFERENCES
315                          Cache  accesses.   Usually this indicates Last Level
316                          Cache accesses but this may vary depending  on  your
317                          CPU.  This may include prefetches and coherency mes‐
318                          sages; again this depends on the design of your CPU.
319
320                   PERF_COUNT_HW_CACHE_MISSES
321                          Cache misses.  Usually  this  indicates  Last  Level
322                          Cache  misses;  this  is intended to be used in con‐
323                          junction  with  the   PERF_COUNT_HW_CACHE_REFERENCES
324                          event to calculate cache miss rates.
325
326                   PERF_COUNT_HW_BRANCH_INSTRUCTIONS
327                          Retired branch instructions.  Prior to Linux 2.6.35,
328                          this used the wrong event on AMD processors.
329
330                   PERF_COUNT_HW_BRANCH_MISSES
331                          Mispredicted branch instructions.
332
333                   PERF_COUNT_HW_BUS_CYCLES
334                          Bus cycles, which can be different  from  total  cy‐
335                          cles.
336
337                   PERF_COUNT_HW_STALLED_CYCLES_FRONTEND (since Linux 3.0)
338                          Stalled cycles during issue.
339
340                   PERF_COUNT_HW_STALLED_CYCLES_BACKEND (since Linux 3.0)
341                          Stalled cycles during retirement.
342
343                   PERF_COUNT_HW_REF_CPU_CYCLES (since Linux 3.3)
344                          Total cycles; not affected by CPU frequency scaling.
345
346              If  type is PERF_TYPE_SOFTWARE, we are measuring software events
347              provided by the kernel.  Set config to one of the following:
348
349                   PERF_COUNT_SW_CPU_CLOCK
350                          This reports the CPU clock, a  high-resolution  per-
351                          CPU timer.
352
353                   PERF_COUNT_SW_TASK_CLOCK
354                          This reports a clock count specific to the task that
355                          is running.
356
357                   PERF_COUNT_SW_PAGE_FAULTS
358                          This reports the number of page faults.
359
360                   PERF_COUNT_SW_CONTEXT_SWITCHES
361                          This counts context switches.  Until  Linux  2.6.34,
362                          these  were all reported as user-space events; after
363                          that they are reported as happening in the kernel.
364
365                   PERF_COUNT_SW_CPU_MIGRATIONS
366                          This reports the number of times the process has mi‐
367                          grated to a new CPU.
368
369                   PERF_COUNT_SW_PAGE_FAULTS_MIN
370                          This  counts the number of minor page faults.  These
371                          did not require disk I/O to handle.
372
373                   PERF_COUNT_SW_PAGE_FAULTS_MAJ
374                          This counts the number of major page faults.   These
375                          required disk I/O to handle.
376
377                   PERF_COUNT_SW_ALIGNMENT_FAULTS (since Linux 2.6.33)
378                          This  counts  the number of alignment faults.  These
379                          occur  when  unaligned memory accesses  happen;  the
380                          kernel  can handle these but it reduces performance.
381                          This happens only on some  architectures  (never  on
382                          x86).
383
384                   PERF_COUNT_SW_EMULATION_FAULTS (since Linux 2.6.33)
385                          This  counts  the  number  of emulation faults.  The
386                          kernel sometimes traps on unimplemented instructions
387                          and  emulates  them  for user space.  This can nega‐
388                          tively impact performance.
389
390                   PERF_COUNT_SW_DUMMY (since Linux 3.12)
391                          This is a placeholder  event  that  counts  nothing.
392                          Informational  sample  record  types such as mmap or
393                          comm must be associated with an active event.   This
394                          dummy  event  allows  gathering such records without
395                          requiring a counting event.
396
397                   PERF_COUNT_SW_BPF_OUTPUT (since Linux 4.4)
398                          This is used to generate raw sample data  from  BPF.
399                          BPF   programs   can   write  to  this  event  using
400                          bpf_perf_event_output helper.
401
402                   PERF_COUNT_SW_CGROUP_SWITCHES (since Linux 5.13)
403                          This counts context switches to a task in a  differ‐
404                          ent  cgroup.  In other words, if the next task is in
405                          the same cgroup, it won't count the switch.
406
407              If type is PERF_TYPE_TRACEPOINT, then we  are  measuring  kernel
408              tracepoints.   The  value  to use in config can be obtained from
409              under debugfs tracing/events/*/*/id if ftrace is enabled in  the
410              kernel.
411
412              If  type is PERF_TYPE_HW_CACHE, then we are measuring a hardware
413              CPU cache event.  To calculate the appropriate config value, use
414              the following equation:
415
416                      config = (perf_hw_cache_id) |
417                               (perf_hw_cache_op_id << 8) |
418                               (perf_hw_cache_op_result_id << 16);
419
420                  where perf_hw_cache_id is one of:
421
422                      PERF_COUNT_HW_CACHE_L1D
423                             for measuring Level 1 Data Cache
424
425                      PERF_COUNT_HW_CACHE_L1I
426                             for measuring Level 1 Instruction Cache
427
428                      PERF_COUNT_HW_CACHE_LL
429                             for measuring Last-Level Cache
430
431                      PERF_COUNT_HW_CACHE_DTLB
432                             for measuring the Data TLB
433
434                      PERF_COUNT_HW_CACHE_ITLB
435                             for measuring the Instruction TLB
436
437                      PERF_COUNT_HW_CACHE_BPU
438                             for measuring the branch prediction unit
439
440                      PERF_COUNT_HW_CACHE_NODE (since Linux 3.1)
441                             for measuring local memory accesses
442
443                  and perf_hw_cache_op_id is one of:
444
445                      PERF_COUNT_HW_CACHE_OP_READ
446                             for read accesses
447
448                      PERF_COUNT_HW_CACHE_OP_WRITE
449                             for write accesses
450
451                      PERF_COUNT_HW_CACHE_OP_PREFETCH
452                             for prefetch accesses
453
454                  and perf_hw_cache_op_result_id is one of:
455
456                      PERF_COUNT_HW_CACHE_RESULT_ACCESS
457                             to measure accesses
458
459                      PERF_COUNT_HW_CACHE_RESULT_MISS
460                             to measure misses
461
462              If  type  is  PERF_TYPE_RAW, then a custom "raw" config value is
463              needed.  Most CPUs support events that are not  covered  by  the
464              "generalized"  events.   These  are  implementation defined; see
465              your CPU manual (for example the Intel Volume  3B  documentation
466              or  the  AMD  BIOS and Kernel Developer Guide).  The libpfm4 li‐
467              brary can be used to translate from the name  in  the  architec‐
468              tural  manuals to the raw hex value perf_event_open() expects in
469              this field.
470
471              If type is PERF_TYPE_BREAKPOINT, then leave config set to  zero.
472              Its parameters are set in other places.
473
474              If  type is kprobe or uprobe, set retprobe (bit 0 of config, see
475              /sys/bus/event_source/devices/[k,u]probe/format/retprobe)    for
476              kretprobe/uretprobe.    See   fields  kprobe_func,  uprobe_path,
477              kprobe_addr, and probe_offset for more details.
478
479       kprobe_func, uprobe_path, kprobe_addr, and probe_offset
480              These fields describe the kprobe/uprobe for dynamic PMUs  kprobe
481              and  uprobe.   For  kprobe: use kprobe_func and probe_offset, or
482              use kprobe_addr and leave kprobe_func as NULL.  For uprobe:  use
483              uprobe_path and probe_offset.
484
485       sample_period, sample_freq
486              A  "sampling"  event is one that generates an overflow notifica‐
487              tion every N events, where N is given by sample_period.  A  sam‐
488              pling event has sample_period > 0.  When an overflow occurs, re‐
489              quested data is recorded in the mmap  buffer.   The  sample_type
490              field controls what data is recorded on each overflow.
491
492              sample_freq can be used if you wish to use frequency rather than
493              period.  In this case, you set the freq flag.  The  kernel  will
494              adjust  the sampling period to try and achieve the desired rate.
495              The rate of adjustment is a timer tick.
496
497       sample_type
498              The various bits in this field specify which values  to  include
499              in the sample.  They will be recorded in a ring-buffer, which is
500              available to user space using mmap(2).  The order in  which  the
501              values  are saved in the sample is documented in the MMAP Layout
502              subsection below; it is not  the  enum  perf_event_sample_format
503              order.
504
505              PERF_SAMPLE_IP
506                     Records instruction pointer.
507
508              PERF_SAMPLE_TID
509                     Records the process and thread IDs.
510
511              PERF_SAMPLE_TIME
512                     Records a timestamp.
513
514              PERF_SAMPLE_ADDR
515                     Records an address, if applicable.
516
517              PERF_SAMPLE_READ
518                     Record counter values for all events in a group, not just
519                     the group leader.
520
521              PERF_SAMPLE_CALLCHAIN
522                     Records the callchain (stack backtrace).
523
524              PERF_SAMPLE_ID
525                     Records a unique ID for the opened event's group leader.
526
527              PERF_SAMPLE_CPU
528                     Records CPU number.
529
530              PERF_SAMPLE_PERIOD
531                     Records the current sampling period.
532
533              PERF_SAMPLE_STREAM_ID
534                     Records  a  unique  ID  for  the  opened  event.   Unlike
535                     PERF_SAMPLE_ID, the actual ID is returned, not the  group
536                     leader's.  This ID is the same  as  the  one  returned by
537                     PERF_FORMAT_ID.
538
539              PERF_SAMPLE_RAW
540                     Records additional data, if applicable.  Usually returned
541                     by tracepoint events.
542
543              PERF_SAMPLE_BRANCH_STACK (since Linux 3.4)
544                     This provides a record of recent branches, as provided by
545                     CPU  branch  sampling hardware (such as Intel Last Branch
546                     Record).  Not all hardware supports this feature.
547
548                     See the branch_sample_type field for how to filter  which
549                     branches are reported.
550
551              PERF_SAMPLE_REGS_USER (since Linux 3.7)
552                     Records  the  current  user-level CPU register state (the
553                     values in the process before the kernel was called).
554
555              PERF_SAMPLE_STACK_USER (since Linux 3.7)
556                     Records the user level stack, allowing stack unwinding.
557
558              PERF_SAMPLE_WEIGHT (since Linux 3.10)
559                     Records a hardware provided weight value  that  expresses
560                     how  costly the sampled event was.  This allows the hard‐
561                     ware to highlight expensive events in a profile.
562
563              PERF_SAMPLE_DATA_SRC (since Linux 3.10)
564                     Records the data source: where in  the  memory  hierarchy
565                     the  data  associated  with  the sampled instruction came
566                     from.  This is available only if the underlying  hardware
567                     supports this feature.
568
569              PERF_SAMPLE_IDENTIFIER (since Linux 3.12)
570                     Places  the  SAMPLE_ID  value  in a fixed position in the
571                     record, either at the beginning (for sample events) or at
572                     the end (if a non-sample event).
573
574                     This  was  necessary  because  a  sample  stream may have
575                     records from various different event sources with differ‐
576                     ent sample_type settings.  Parsing the event stream prop‐
577                     erly was not possible because the format  of  the  record
578                     was needed to find SAMPLE_ID, but the format could not be
579                     found without knowing what event the sample  belonged  to
580                     (causing a circular dependency).
581
582                     The PERF_SAMPLE_IDENTIFIER setting makes the event stream
583                     always parsable by putting SAMPLE_ID in a fixed location,
584                     even though it means having duplicate SAMPLE_ID values in
585                     records.
586
587              PERF_SAMPLE_TRANSACTION (since Linux 3.13)
588                     Records reasons for  transactional  memory  abort  events
589                     (for  example,  from  Intel TSX transactional memory sup‐
590                     port).
591
592                     The precise_ip setting must  be  greater  than  0  and  a
593                     transactional  memory  abort event must be measured or no
594                     values will be recorded.  Also note that some  perf_event
595                     measurements,  such  as sampled cycle counting, may cause
596                     extraneous aborts  (by  causing  an  interrupt  during  a
597                     transaction).
598
599              PERF_SAMPLE_REGS_INTR (since Linux 3.19)
600                     Records  a  subset  of  the current CPU register state as
601                     specified   by   sample_regs_intr.    Unlike    PERF_SAM‐
602                     PLE_REGS_USER the register values will return kernel reg‐
603                     ister state if the overflow happened while kernel code is
604                     running.  If the CPU supports hardware sampling of regis‐
605                     ter state (i.e., PEBS on Intel x86) and precise_ip is set
606                     higher  than  zero  then the register values returned are
607                     those captured by hardware at the time of the sampled in‐
608                     struction's retirement.
609
610              PERF_SAMPLE_PHYS_ADDR (since Linux 4.13)
611                     Records  physical  address  of  data  like  in  PERF_SAM‐
612                     PLE_ADDR.
613
614              PERF_SAMPLE_CGROUP (since Linux 5.7)
615                     Records (perf_event) cgroup ID of the process.  This cor‐
616                     responds to the id field in the PERF_RECORD_CGROUP event.
617
618              PERF_SAMPLE_DATA_PAGE_SIZE (since Linux 5.11)
619                     Records page size of data like in PERF_SAMPLE_ADDR.
620
621              PERF_SAMPLE_CODE_PAGE_SIZE (since Linux 5.11)
622                     Records page size of ip like in PERF_SAMPLE_IP.
623
624              PERF_SAMPLE_WEIGHT_STRUCT (since Linux 5.12)
625                     Records hardware provided weight values like in PERF_SAM‐
626                     PLE_WEIGHT, but it can represent  multiple  values  in  a
627                     struct.    This   shares  the  same  space  as  PERF_SAM‐
628                     PLE_WEIGHT, so users can apply either of those, not both.
629                     It has the following format and the meaning of each field
630                     is dependent on the hardware implementation.
631
632                  union perf_sample_weight {
633                      u64  full;           /* PERF_SAMPLE_WEIGHT */
634                      struct {             /* PERF_SAMPLE_WEIGHT_STRUCT */
635                          u32  var1_dw;
636                          u16  var2_w;
637                          u16  var3_w;
638                      };
639                  };
640
641       read_format
642              This field specifies the format of the data returned by  read(2)
643              on a perf_event_open() file descriptor.
644
645              PERF_FORMAT_TOTAL_TIME_ENABLED
646                     Adds  the 64-bit time_enabled field.  This can be used to
647                     calculate estimated totals if the  PMU  is  overcommitted
648                     and multiplexing is happening.
649
650              PERF_FORMAT_TOTAL_TIME_RUNNING
651                     Adds  the 64-bit time_running field.  This can be used to
652                     calculate estimated totals if the  PMU  is  overcommitted
653                     and multiplexing is happening.
654
655              PERF_FORMAT_ID
656                     Adds  a 64-bit unique value that corresponds to the event
657                     group.
658
659              PERF_FORMAT_GROUP
660                     Allows all counter values in an event group  to  be  read
661                     with one read.
662
663              PERF_FORMAT_LOST  (since Linux 6.0)
664                     Adds  a  64-bit  value that is the number of lost samples
665                     for this event.  This would be only meaningful when  sam‐
666                     ple_period or sample_freq is set.
667
668       disabled
669              The  disabled  bit specifies whether the counter starts out dis‐
670              abled or enabled.  If disabled, the event can later  be  enabled
671              by ioctl(2), prctl(2), or enable_on_exec.
672
673              When creating an event group, typically the group leader is ini‐
674              tialized with disabled set to 1 and any child  events  are  ini‐
675              tialized  with disabled set to 0.  Despite disabled being 0, the
676              child events will not start until the group leader is enabled.
677
678       inherit
679              The inherit bit specifies that this counter should count  events
680              of child tasks as well as the task specified.  This applies only
681              to new children, not to any existing children at  the  time  the
682              counter  is  created  (nor to any new children of existing chil‐
683              dren).
684
685              Inherit does not work for some combinations of read_format  val‐
686              ues, such as PERF_FORMAT_GROUP.
687
688       pinned The  pinned  bit  specifies that the counter should always be on
689              the CPU if at all possible.  It applies only to  hardware  coun‐
690              ters  and  only to group leaders.  If a pinned counter cannot be
691              put onto the CPU (e.g., because there are  not  enough  hardware
692              counters  or  because of a conflict with some other event), then
693              the counter goes into an 'error' state, where reads return  end-
694              of-file  (i.e.,  read(2)  returns 0) until the counter is subse‐
695              quently enabled or disabled.
696
697       exclusive
698              The exclusive bit specifies that when this counter's group is on
699              the  CPU,  it should be the only group using the CPU's counters.
700              In the future this may allow monitoring programs to support  PMU
701              features  that  need  to  run  alone so that they do not disrupt
702              other hardware counters.
703
704              Note that many unexpected situations may prevent events with the
705              exclusive  bit  set  from ever running.  This includes any users
706              running a system-wide measurement as well as any kernel  use  of
707              the  performance  counters  (including  the commonly enabled NMI
708              Watchdog Timer interface).
709
710       exclude_user
711              If this bit is set, the count excludes  events  that  happen  in
712              user space.
713
714       exclude_kernel
715              If  this  bit  is  set, the count excludes events that happen in
716              kernel space.
717
718       exclude_hv
719              If this bit is set, the count excludes events that happen in the
720              hypervisor.   This is mainly for PMUs that have built-in support
721              for handling this (such as POWER).  Extra support is needed  for
722              handling hypervisor measurements on most machines.
723
724       exclude_idle
725              If  set,  don't  count  when  the  CPU is running the idle task.
726              While you can currently enable this for any event  type,  it  is
727              ignored for all but software events.
728
729       mmap   The  mmap bit enables generation of PERF_RECORD_MMAP samples for
730              every mmap(2) call that has PROT_EXEC set.  This allows tools to
731              notice  new executable code being mapped into a program (dynamic
732              shared libraries for example) so that addresses  can  be  mapped
733              back to the original code.
734
735       comm   The  comm  bit enables tracking of process command name as modi‐
736              fied by the execve(2) and  prctl(PR_SET_NAME)  system  calls  as
737              well  as  writing  to /proc/self/comm.  If the comm_exec flag is
738              also successfully set (possible since Linux 3.16), then the misc
739              flag PERF_RECORD_MISC_COMM_EXEC can be used to differentiate the
740              execve(2) case from the others.
741
742       freq   If this bit is set, then sample_freq, not sample_period, is used
743              when setting up the sampling interval.
744
745       inherit_stat
746              This  bit  enables  saving of event counts on context switch for
747              inherited tasks.  This is meaningful only if the  inherit  field
748              is set.
749
750       enable_on_exec
751              If  this  bit is set, a counter is automatically enabled after a
752              call to execve(2).
753
754       task   If this bit is set, then fork/exit notifications are included in
755              the ring buffer.
756
757       watermark
758              If  set,  have an overflow notification happen when we cross the
759              wakeup_watermark boundary.   Otherwise,  overflow  notifications
760              happen after wakeup_events samples.
761
762       precise_ip (since Linux 2.6.35)
763              This controls the amount of skid.  Skid is how many instructions
764              execute between an event of interest happening  and  the  kernel
765              being able to stop and record the event.  Smaller skid is better
766              and allows more accurate reporting of which events correspond to
767              which instructions, but hardware is often limited with how small
768              this can be.
769
770              The possible values of this field are the following:
771
772              0      SAMPLE_IP can have arbitrary skid.
773
774              1      SAMPLE_IP must have constant skid.
775
776              2      SAMPLE_IP requested to have 0 skid.
777
778              3      SAMPLE_IP must have 0 skid.  See also the description  of
779                     PERF_RECORD_MISC_EXACT_IP.
780
781       mmap_data (since Linux 2.6.36)
782              This is the counterpart of the mmap field.  This enables genera‐
783              tion of PERF_RECORD_MMAP samples for mmap(2) calls that  do  not
784              have PROT_EXEC set (for example data and SysV shared memory).
785
786       sample_id_all (since Linux 2.6.38)
787              If  set, then TID, TIME, ID, STREAM_ID, and CPU can additionally
788              be included in non-PERF_RECORD_SAMPLEs if the corresponding sam‐
789              ple_type is selected.
790
791              If  PERF_SAMPLE_IDENTIFIER  is  specified, then an additional ID
792              value is included as the last value to ease parsing  the  record
793              stream.  This may lead to the id value appearing twice.
794
795              The layout is described by this pseudo-structure:
796
797                  struct sample_id {
798                      { u32 pid, tid; }   /* if PERF_SAMPLE_TID set */
799                      { u64 time;     }   /* if PERF_SAMPLE_TIME set */
800                      { u64 id;       }   /* if PERF_SAMPLE_ID set */
801                      { u64 stream_id;}   /* if PERF_SAMPLE_STREAM_ID set  */
802                      { u32 cpu, res; }   /* if PERF_SAMPLE_CPU set */
803                      { u64 id;       }   /* if PERF_SAMPLE_IDENTIFIER set */
804                  };
805
806       exclude_host (since Linux 3.2)
807              When  conducting  measurements that include processes running VM
808              instances (i.e., have executed a KVM_RUN ioctl(2)), only measure
809              events happening inside a guest instance.  This is only meaning‐
810              ful outside the guests; this  setting  does  not  change  counts
811              gathered  inside  of  a guest.  Currently, this functionality is
812              x86 only.
813
814       exclude_guest (since Linux 3.2)
815              When conducting measurements that include processes  running  VM
816              instances  (i.e., have executed a KVM_RUN ioctl(2)), do not mea‐
817              sure events happening inside  guest  instances.   This  is  only
818              meaningful  outside  the  guests;  this  setting does not change
819              counts gathered inside of a guest.  Currently, this  functional‐
820              ity is x86 only.
821
822       exclude_callchain_kernel (since Linux 3.7)
823              Do not include kernel callchains.
824
825       exclude_callchain_user (since Linux 3.7)
826              Do not include user callchains.
827
828       mmap2 (since Linux 3.16)
829              Generate an extended executable mmap record that contains enough
830              additional information to  uniquely  identify  shared  mappings.
831              The mmap flag must also be set for this to work.
832
833       comm_exec (since Linux 3.16)
834              This is purely a feature-detection flag; it does not change ker‐
835              nel behavior.  If this flag can successfully be set, then,  when
836              comm is enabled, the PERF_RECORD_MISC_COMM_EXEC flag will be set
837              in the misc field of a comm record header if  the  rename  event
838              being  reported  was caused by a call to execve(2).  This allows
839              tools to distinguish between the various types of process renam‐
840              ing.
841
842       use_clockid (since Linux 4.1)
843              This  allows  selecting  which  internal Linux clock to use when
844              generating timestamps via the clockid field.  This can  make  it
845              easier  to correlate perf sample times with timestamps generated
846              by other tools.
847
848       context_switch (since Linux 4.3)
849              This enables the generation of PERF_RECORD_SWITCH records when a
850              context  switch  occurs.   It  also  enables  the  generation of
851              PERF_RECORD_SWITCH_CPU_WIDE records when  sampling  in  CPU-wide
852              mode.   This functionality is in addition to existing tracepoint
853              and software events for measuring context switches.  The  advan‐
854              tage  of  this method is that it will give full information even
855              with strict perf_event_paranoid settings.
856
857       write_backward (since Linux 4.6)
858              This causes the ring buffer to be written from the  end  to  the
859              beginning.  This is to support reading from an overwritable ring
860              buffer.
861
862       namespaces (since Linux 4.11)
863              This enables the generation  of  PERF_RECORD_NAMESPACES  records
864              when a task enters a new namespace.  Each namespace has a combi‐
865              nation of device and inode numbers.
866
867       ksymbol (since Linux 5.0)
868              This enables the generation of PERF_RECORD_KSYMBOL records  when
869              new kernel symbols are registered or unregistered.  This is  for
870              analyzing dynamic kernel functions like eBPF.
871
872       bpf_event (since Linux 5.0)
873              This enables the  generation  of  PERF_RECORD_BPF_EVENT  records
874              when an eBPF program is loaded or unloaded.
875
876       aux_output (since Linux 5.4)
877              This  allows  normal  (non-AUX)  events to generate data for AUX
878              events if the hardware supports it.
879
880       cgroup (since Linux 5.7)
881              This enables the generation of PERF_RECORD_CGROUP records when a
882              new cgroup is created (and activated).
883
884       text_poke (since Linux 5.8)
885              This  enables  the  generation  of PERF_RECORD_TEXT_POKE records
886              when there's a change to the kernel text  (i.e.,  self-modifying
887              code).
888
889       build_id (since Linux 5.12)
890              This  changes  the  contents  in the PERF_RECORD_MMAP2 to have a
891              build-id instead of device and inode numbers.
892
893       inherit_thread (since Linux 5.13)
894              This disables the inheritance of the event to a  child  process.
895              Only  new  threads  in  the  same  process (which is cloned with
896              CLONE_THREAD) will inherit the event.
897
898       remove_on_exec (since Linux 5.13)
899              This closes the event when it starts a new process image by  ex‐
900              ecve(2).
901
902       sigtrap (since Linux 5.13)
903              This  enables  synchronous  signal  delivery of SIGTRAP on event
904              overflow.
905
906       wakeup_events, wakeup_watermark
907              This union  sets  how  many  samples  (wakeup_events)  or  bytes
908              (wakeup_watermark)  happen  before an overflow notification hap‐
909              pens.  Which one is used is selected by the watermark bit flag.
910
911              wakeup_events counts only PERF_RECORD_SAMPLE record  types.   To
912              receive  overflow  notification for all PERF_RECORD types choose
913              watermark and set wakeup_watermark to 1.
914
915              Prior to Linux 3.0, setting wakeup_events to 0  resulted  in  no
916              overflow  notifications; more recent kernels treat 0 the same as
917              1.
918
919       bp_type (since Linux 2.6.33)
920              This chooses the breakpoint type.  It is one of:
921
922              HW_BREAKPOINT_EMPTY
923                     No breakpoint.
924
925              HW_BREAKPOINT_R
926                     Count when we read the memory location.
927
928              HW_BREAKPOINT_W
929                     Count when we write the memory location.
930
931              HW_BREAKPOINT_RW
932                     Count when we read or write the memory location.
933
934              HW_BREAKPOINT_X
935                     Count when we execute code at the memory location.
936
937              The values can be combined via a bitwise or, but the combination
938              of  HW_BREAKPOINT_R  or  HW_BREAKPOINT_W with HW_BREAKPOINT_X is
939              not allowed.
940
941       bp_addr (since Linux 2.6.33)
942              This is the address of the  breakpoint.   For  execution  break‐
943              points,  this is the memory address of the instruction of inter‐
944              est; for read and write breakpoints, it is the memory address of
945              the memory location of interest.
946
947       config1 (since Linux 2.6.39)
948              config1  is  used for setting events that need an extra register
949              or otherwise do not fit in the regular config field.   Raw  OFF‐
950              CORE_EVENTS  on  Nehalem/Westmere/SandyBridge  use this field on
951              Linux 3.3 and later kernels.
952
953       bp_len (since Linux 2.6.33)
954              bp_len is the length of the breakpoint being measured if type is
955              PERF_TYPE_BREAKPOINT.     Options    are    HW_BREAKPOINT_LEN_1,
956              HW_BREAKPOINT_LEN_2,    HW_BREAKPOINT_LEN_4,    and    HW_BREAK‐
957              POINT_LEN_8.    For   an   execution  breakpoint,  set  this  to
958              sizeof(long).
959
960       config2 (since Linux 2.6.39)
961              config2 is a further extension of the config1 field.
962
963       branch_sample_type (since Linux 3.4)
964              If PERF_SAMPLE_BRANCH_STACK is enabled, then this specifies what
965              branches to include in the branch record.
966
967              The  first  part of the value is the privilege level, which is a
968              combination of one or more of the values listed below.   If  the
969              user  does  not  set privilege level explicitly, the kernel will
970              use the event's privilege level.  Event  and  branch  privilege
971              levels do not have to match.
972
973              PERF_SAMPLE_BRANCH_USER
974                     Branch target is in user space.
975
976              PERF_SAMPLE_BRANCH_KERNEL
977                     Branch target is in kernel space.
978
979              PERF_SAMPLE_BRANCH_HV
980                     Branch target is in hypervisor.
981
982              PERF_SAMPLE_BRANCH_PLM_ALL
983                     A  convenience  value  that is the three preceding values
984                     ORed together.
985
986              In  addition  to  the  privilege  value,  at  least  one  of the
987              following bits must be set.
988
989              PERF_SAMPLE_BRANCH_ANY
990                     Any branch type.
991
992              PERF_SAMPLE_BRANCH_ANY_CALL
993                     Any  call  branch (includes direct calls, indirect calls,
994                     and far jumps).
995
996              PERF_SAMPLE_BRANCH_IND_CALL
997                     Indirect calls.
998
999              PERF_SAMPLE_BRANCH_CALL (since Linux 4.4)
1000                     Direct calls.
1001
1002              PERF_SAMPLE_BRANCH_ANY_RETURN
1003                     Any return branch.
1004
1005              PERF_SAMPLE_BRANCH_IND_JUMP (since Linux 4.2)
1006                     Indirect jumps.
1007
1008              PERF_SAMPLE_BRANCH_COND (since Linux 3.16)
1009                     Conditional branches.
1010
1011              PERF_SAMPLE_BRANCH_ABORT_TX (since Linux 3.11)
1012                     Transactional memory aborts.
1013
1014              PERF_SAMPLE_BRANCH_IN_TX (since Linux 3.11)
1015                     Branch in transactional memory transaction.
1016
1017              PERF_SAMPLE_BRANCH_NO_TX (since Linux 3.11)
1018                     Branch not in transactional memory transaction.
1019
1020              PERF_SAMPLE_BRANCH_CALL_STACK (since Linux 4.1)
1021                     Branch is part of a hardware-generated call stack.   This
1022                     requires  hardware support, currently only found on Intel
                         x86 Haswell or newer.
1023
1024       sample_regs_user (since Linux 3.7)
1025              This bit mask defines the set of user CPU registers to  dump  on
1026              samples.   The  layout of the register mask is architecture-spe‐
1027              cific and is described in the kernel header  file  arch/ARCH/in‐
1028              clude/uapi/asm/perf_regs.h.
1029
1030       sample_stack_user (since Linux 3.7)
1031              This  defines  the  size  of the user stack to dump if PERF_SAM‐
1032              PLE_STACK_USER is specified.
1033
1034       clockid (since Linux 4.1)
1035              If use_clockid is set, then this field  selects  which  internal
1036              Linux timer to use for timestamps.  The available timers are de‐
1037              fined  in  linux/time.h,   with   CLOCK_MONOTONIC,   CLOCK_MONO‐
1038              TONIC_RAW,  CLOCK_REALTIME,  CLOCK_BOOTTIME,  and CLOCK_TAI cur‐
1039              rently supported.
1040
1041       aux_watermark (since Linux 4.1)
1042              This  specifies  how  much  data  is  required  to   trigger   a
1043              PERF_RECORD_AUX sample.
1044
1045       sample_max_stack (since Linux 4.8)
1046              When  sample_type  includes  PERF_SAMPLE_CALLCHAIN,  this  field
1047              specifies how many stack frames to report  when  generating  the
1048              callchain.
1049
1050       aux_sample_size (since Linux 5.5)
1051              When the PERF_SAMPLE_AUX flag is set, this field  specifies  the
1052              desired size of AUX data.  Note that the sample may contain less
1053              data than the specified size.
1054
1055       sig_data (since Linux 5.13)
1056              This data will be copied to the user's signal  handler  (through
1057              si_perf in the siginfo_t) to disambiguate which event  triggered
1058              the signal.
1059
1060   Reading results
1061       Once a perf_event_open() file descriptor has been opened, the values of
1062       the events can be read from the file descriptor.  The values  that  are
1063       there  are  specified by the read_format field in the attr structure at
1064       open time.
1065
1066       If you attempt to read into a buffer that is not big enough to hold the
1067       data, the error ENOSPC results.
1068
1069       Here is the layout of the data returned by a read:
1070
1071       •  If  PERF_FORMAT_GROUP was specified to allow reading all events in a
1072          group at once:
1073
1074              struct read_format {
1075                  u64 nr;            /* The number of events */
1076                  u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1077                  u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1078                  struct {
1079                      u64 value;     /* The value of the event */
1080                      u64 id;        /* if PERF_FORMAT_ID */
1081                      u64 lost;      /* if PERF_FORMAT_LOST */
1082                  } values[nr];
1083              };
1084
1085       •  If PERF_FORMAT_GROUP was not specified:
1086
1087              struct read_format {
1088                  u64 value;         /* The value of the event */
1089                  u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1090                  u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1091                  u64 id;            /* if PERF_FORMAT_ID */
1092                  u64 lost;          /* if PERF_FORMAT_LOST */
1093              };
1094
1095       The values read are as follows:
1096
1097       nr     The number of events in this file descriptor.  Available only if
1098              PERF_FORMAT_GROUP was specified.
1099
1100       time_enabled, time_running
1101              Total  time  the  event was enabled and running.  Normally these
1102              values are the same.  Multiplexing  happens  if  the  number  of
1103              events  is  more than the number of available PMU counter slots.
1104              In that case the events run  only  part  of  the  time  and  the
1105              time_enabled and time_running values can be used to scale an es‐
1106              timated value for the count.
1107
1108       value  An unsigned 64-bit value containing the counter result.
1109
1110       id     A globally unique value for this particular event; only  present
1111              if PERF_FORMAT_ID was specified in read_format.
1112
1113       lost   The  number  of  lost  samples  of  this  event; only present if
1114              PERF_FORMAT_LOST was specified in read_format.
1115
1116   MMAP layout
1117       When using perf_event_open() in sampled mode, asynchronous events (like
1118       counter  overflow  or  PROT_EXEC mmap tracking) are logged into a ring-
1119       buffer.  This ring-buffer is created and accessed through mmap(2).
1120
1121       The mmap size should be 1+2^n pages, where the first page is a metadata
1122       page (struct perf_event_mmap_page) that contains various bits of infor‐
1123       mation such as where the ring-buffer head is.
1124
1125       Before Linux 2.6.39, there is a bug that means  you  must  allocate  an
1126       mmap ring buffer when sampling even if you do not plan to access it.
1127
1128       The structure of the first metadata mmap page is as follows:
1129
1130           struct perf_event_mmap_page {
1131               __u32 version;        /* version number of this structure */
1132               __u32 compat_version; /* lowest version this is compat with */
1133               __u32 lock;           /* seqlock for synchronization */
1134               __u32 index;          /* hardware counter identifier */
1135               __s64 offset;         /* add to hardware counter value */
1136               __u64 time_enabled;   /* time event active */
1137               __u64 time_running;   /* time event on CPU */
1138               union {
1139                   __u64   capabilities;
1140                   struct {
1141                       __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
1142                             cap_bit0_is_deprecated : 1,
1143                             cap_user_rdpmc         : 1,
1144                             cap_user_time          : 1,
1145                             cap_user_time_zero     : 1,
1146                   };
1147               };
1148               __u16 pmc_width;
1149               __u16 time_shift;
1150               __u32 time_mult;
1151               __u64 time_offset;
1152               __u64 __reserved[120];   /* Pad to 1 k */
1153               __u64 data_head;         /* head in the data section */
1154               __u64 data_tail;         /* user-space written tail */
1155               __u64 data_offset;       /* where the buffer starts */
1156               __u64 data_size;         /* data buffer size */
1157               __u64 aux_head;
1158               __u64 aux_tail;
1159               __u64 aux_offset;
1160               __u64 aux_size;
1161
1162           }
1163
1164       The  following  list  describes  the fields in the perf_event_mmap_page
1165       structure in more detail:
1166
1167       version
1168              Version number of this structure.
1169
1170       compat_version
1171              The lowest version this is compatible with.
1172
1173       lock   A seqlock for synchronization.
1174
1175       index  A unique hardware counter identifier.
1176
1177       offset When using rdpmc for reads this offset value must  be  added  to
1178              the one returned by rdpmc to get the current total event count.
1179
1180       time_enabled
1181              Time the event was active.
1182
1183       time_running
1184              Time the event was running.
1185
1186       cap_usr_time / cap_usr_rdpmc / cap_bit0 (since Linux 3.4)
1187              There   was   a  bug  in  the  definition  of  cap_usr_time  and
1188              cap_usr_rdpmc from Linux 3.4 until Linux 3.11.  Both  bits  were
1189              defined  to  point to the same location, so it was impossible to
1190              know if cap_usr_time or cap_usr_rdpmc were actually set.
1191
1192              Starting with Linux 3.12, these are renamed to cap_bit0 and  you
1193              should use the cap_user_time and cap_user_rdpmc fields instead.
1194
1195       cap_bit0_is_deprecated (since Linux 3.12)
1196              If set, this bit indicates that the kernel supports the properly
1197              separated cap_user_time and cap_user_rdpmc bits.
1198
1199              If not set, it indicates an older kernel where cap_usr_time  and
1200              cap_usr_rdpmc  map to the same bit and thus both features should
1201              be used with caution.
1202
1203       cap_user_rdpmc (since Linux 3.12)
1204              If the hardware supports user-space read of performance counters
1205              without  syscall  (this is the "rdpmc" instruction on x86), then
1206              the following code can be used to do a read:
1207
1208                  u32 seq, time_mult, time_shift, idx, width;
1209                  u64 count, enabled, running;
1210                  u64 cyc, time_offset;
1211
1212                  do {
1213                      seq = pc->lock;
1214                      barrier();
1215                      enabled = pc->time_enabled;
1216                      running = pc->time_running;
1217
1218                      if (pc->cap_user_time && enabled != running) {
1219                          cyc = rdtsc();
1220                          time_offset = pc->time_offset;
1221                          time_mult   = pc->time_mult;
1222                          time_shift  = pc->time_shift;
1223                      }
1224
1225                      idx = pc->index;
1226                      count = pc->offset;
1227
1228                      if (pc->cap_user_rdpmc && idx) {
1229                          width = pc->pmc_width;
1230                          count += rdpmc(idx - 1);
1231                      }
1232
1233                      barrier();
1234                  } while (pc->lock != seq);
1235
1236       cap_user_time (since Linux 3.12)
1237              This bit indicates the hardware has a  constant,  nonstop  time‐
1238              stamp counter (TSC on x86).
1239
1240       cap_user_time_zero (since Linux 3.12)
1241              Indicates  the  presence of time_zero which allows mapping time‐
1242              stamp values to the hardware clock.
1243
1244       pmc_width
1245              If cap_user_rdpmc is set, this field provides the  bit-width  of
1246              the value read using the rdpmc or equivalent instruction.   This
1247              can be used to sign-extend the result like:
1248
1249                  pmc <<= 64 - pmc_width;
1250                  pmc >>= 64 - pmc_width; // signed shift right
1251                  count += pmc;
1252
1253       time_shift, time_mult, time_offset
1254
1255              If cap_user_time, these fields can be used to compute  the  time
1256              delta  since  time_enabled (in nanoseconds) using rdtsc or simi‐
1257              lar.
1258
1259                  u64 quot, rem;
1260                  u64 delta;
1261
1262                  quot  = cyc >> time_shift;
1263                  rem   = cyc & (((u64)1 << time_shift) - 1);
1264                  delta = time_offset + quot * time_mult +
1265                          ((rem * time_mult) >> time_shift);
1266
1267              Where time_offset, time_mult, time_shift, and cyc  are  read  in
1268              the seqcount loop described above.  This delta can then be added
1269              to enabled and possibly running (if idx), improving the scaling:
1270
1271                  enabled += delta;
1272                  if (idx)
1273                      running += delta;
1274                  quot  = count / running;
1275                  rem   = count % running;
1276                  count = quot * enabled + (rem * enabled) / running;
1277
1278       time_zero (since Linux 3.12)
1279
1280              If cap_user_time_zero is set, then the hardware clock  (the  TSC
1281              timestamp  counter on x86) can be calculated from the time_zero,
1282              time_mult, and time_shift values:
1283
1284                  time = timestamp - time_zero;
1285                  quot = time / time_mult;
1286                  rem  = time % time_mult;
1287                  cyc  = (quot << time_shift) + (rem << time_shift) / time_mult;
1288
1289              And vice versa:
1290
1291                  quot = cyc >> time_shift;
1292                  rem  = cyc & (((u64)1 << time_shift) - 1);
1293                  timestamp = time_zero + quot * time_mult +
1294                              ((rem * time_mult) >> time_shift);
1295
1296       data_head
1297              This points to the head of the data section.  The value continu‐
1298              ously  increases; it does not wrap.  The value needs to be manu‐
1299              ally wrapped by the size of the mmap buffer before accessing the
1300              samples.
1301
1302              On  SMP-capable  platforms,  after  reading the data_head value,
1303              user space should issue an rmb().
1304
1305       data_tail
1306              When the mapping is PROT_WRITE, the data_tail  value  should  be
1307              written  by  user  space to reflect the last read data.  In this
1308              case, the kernel will not overwrite unread data.
1309
1310       data_offset (since Linux 4.1)
1311              Contains the offset of the location in  the  mmap  buffer  where
1312              perf sample data begins.
1313
1314       data_size (since Linux 4.1)
1315              Contains the size of the perf sample region within the mmap buf‐
1316              fer.
1317
1318       aux_head, aux_tail, aux_offset, aux_size (since Linux 4.1)
1319              The AUX region allows mmap(2)-ing a separate sample  buffer  for
1320              high-bandwidth  data streams (separate from the main perf sample
1321              buffer).  An example of a high-bandwidth stream  is  instruction
1322              tracing support, as is found in newer Intel processors.
1323
1324              To  set up an AUX area, first aux_offset needs to be set with an
1325              offset greater than data_offset+data_size and aux_size needs  to
1326              be  set to the desired buffer size.  The desired offset and size
1327              must be page aligned, and the size  must  be  a  power  of  two.
1328              These  values  are  then  passed to mmap in order to map the AUX
1329              buffer.  Pages in the AUX buffer are included  as  part  of  the
1330              RLIMIT_MEMLOCK  resource  limit  (see setrlimit(2)), and also as
1331              part of the perf_event_mlock_kb allowance.
1332
1333              By default, the AUX buffer will be truncated if it will not  fit
1334              in the available space in the ring buffer.  If the AUX buffer is
1335              mapped as a read only buffer, then it will operate in ring  buf‐
1336              fer  mode  where  old data will be overwritten by new.  In over‐
1337              write mode, it might not be possible to infer where the new data
1338              began, and it is the consumer's job to disable measurement while
1339              reading to avoid possible data races.
1340
1341              The aux_head and aux_tail ring buffer pointers have the same be‐
1342              havior and ordering  rules as the previously described data_head
1343              and data_tail.
1344
1345       The following 2^n ring-buffer pages have the layout described below.
1346
1347       If perf_event_attr.sample_id_all is set, then all event types will have
1348       the  sample_type  selected  fields  related to where/when (identity) an
1349       event  took  place  (TID,  TIME,  ID,  CPU,  STREAM_ID),  as  described
1350       in  PERF_RECORD_SAMPLE  below.   These  fields  are  stashed just after
1351       the perf_event_header and the fields already  present  for  the  record
1352       type,  that  is,  at  the  end  of  the  payload.   This allows a newer
1353       perf.data file to be supported by older perf tools, with  the  new  op‐
1354       tional fields being ignored.
1355
1356       The mmap values start with a header:
1357
1358           struct perf_event_header {
1359               __u32   type;
1360               __u16   misc;
1361               __u16   size;
1362           };
1363
1364       Below,  we  describe  the perf_event_header fields in more detail.  For
1365       ease of reading, the fields with  shorter  descriptions  are  presented
1366       first.
1367
1368       size   This indicates the size of the record.
1369
1370       misc   The misc field contains additional information about the sample.
1371
1372              The  CPU  mode can be determined from this value by masking with
1373              PERF_RECORD_MISC_CPUMODE_MASK and looking for one of the follow‐
1374              ing  (note  these  are  not  bit masks, only one can be set at a
1375              time):
1376
1377              PERF_RECORD_MISC_CPUMODE_UNKNOWN
1378                     Unknown CPU mode.
1379
1380              PERF_RECORD_MISC_KERNEL
1381                     Sample happened in the kernel.
1382
1383              PERF_RECORD_MISC_USER
1384                     Sample happened in user code.
1385
1386              PERF_RECORD_MISC_HYPERVISOR
1387                     Sample happened in the hypervisor.
1388
1389              PERF_RECORD_MISC_GUEST_KERNEL (since Linux 2.6.35)
1390                     Sample happened in the guest kernel.
1391
1392              PERF_RECORD_MISC_GUEST_USER  (since Linux 2.6.35)
1393                     Sample happened in guest user code.
1394
1395              Since the following three statuses are  generated  by  different
1396              record types, they alias to the same bit:
1397
1398              PERF_RECORD_MISC_MMAP_DATA (since Linux 3.10)
1399                     This is set when the mapping is not executable; otherwise
1400                     the mapping is executable.
1401
1402              PERF_RECORD_MISC_COMM_EXEC (since Linux 3.16)
1403                     This is set for a PERF_RECORD_COMM record on kernels more
1404                     recent  than  Linux  3.16  if  a  process name change was
1405                     caused by an execve(2) system call.
1406
1407              PERF_RECORD_MISC_SWITCH_OUT (since Linux 4.3)
1408                     When a PERF_RECORD_SWITCH or  PERF_RECORD_SWITCH_CPU_WIDE
1409                     record  is generated, this bit indicates that the context
1410                     switch is away from the current process (instead of  into
1411                     the current process).
1412
1413              In addition, the following bits can be set:
1414
1415              PERF_RECORD_MISC_EXACT_IP
1416                     This  indicates that the content of PERF_SAMPLE_IP points
1417                     to the actual instruction that triggered the event.   See
1418                     also perf_event_attr.precise_ip.
1419
1420              PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (since Linux 4.17)
1421                     When  a PERF_RECORD_SWITCH or PERF_RECORD_SWITCH_CPU_WIDE
1422                     record is generated, this indicates  the  context  switch
1423                     was a preemption.
1424
1425              PERF_RECORD_MISC_MMAP_BUILD_ID (since Linux 5.12)
1426                     This indicates that the content of PERF_RECORD_MMAP2 con‐
1427                     tains build-ID data instead of  device  major  and  minor
1428                     numbers as well as the inode number.
1429
1430              PERF_RECORD_MISC_EXT_RESERVED (since Linux 2.6.35)
1431                     This  indicates  there  is  extended data available (cur‐
1432                     rently not used).
1433
1434              PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT
1435                     This bit is not set by the kernel.  It  is  reserved  for
1436                     the    user-space   perf   utility   to   indicate   that
1437                     /proc/pid/maps  parsing  was  taking  too  long  and  was
1438                     stopped, and thus the mmap records may be truncated.
1439
1440       type   The  type  value  is one of the below.  The values in the corre‐
1441              sponding record (that follows the header) depend on the type se‐
1442              lected as shown.
1443
1444              PERF_RECORD_MMAP
1445                  The MMAP events record the PROT_EXEC mappings so that we can
1446                  correlate user-space IPs to code.  They have  the  following
1447                  structure:
1448
1449                      struct {
1450                          struct perf_event_header header;
1451                          u32    pid, tid;
1452                          u64    addr;
1453                          u64    len;
1454                          u64    pgoff;
1455                          char   filename[];
1456                      };
1457
1458                  pid    is the process ID.
1459
1460                  tid    is the thread ID.
1461
1462                  addr   is  the  address of the allocated memory.  len is the
1463                         length of the allocated memory.  pgoff  is  the  page
1464                         offset of the allocated memory.  filename is a string
1465                         describing the backing of the allocated memory.
1466
1467              PERF_RECORD_LOST
1468                  This record indicates when events are lost.
1469
1470                      struct {
1471                          struct perf_event_header header;
1472                          u64    id;
1473                          u64    lost;
1474                          struct sample_id sample_id;
1475                      };
1476
1477                  id     is the unique event ID  for  the  samples  that  were
1478                         lost.
1479
1480                  lost   is the number of events that were lost.
1481
1482              PERF_RECORD_COMM
1483                  This record indicates a change in the process name.
1484
1485                      struct {
1486                          struct perf_event_header header;
1487                          u32    pid;
1488                          u32    tid;
1489                          char   comm[];
1490                          struct sample_id sample_id;
1491                      };
1492
1493                  pid    is the process ID.
1494
1495                  tid    is the thread ID.
1496
1497                  comm   is a string containing the new name of the process.
1498
1499              PERF_RECORD_EXIT
1500                  This record indicates a process exit event.
1501
1502                      struct {
1503                          struct perf_event_header header;
1504                          u32    pid, ppid;
1505                          u32    tid, ptid;
1506                          u64    time;
1507                          struct sample_id sample_id;
1508                      };
1509
1510              PERF_RECORD_THROTTLE, PERF_RECORD_UNTHROTTLE
1511                  This record indicates a throttle/unthrottle event.
1512
1513                      struct {
1514                          struct perf_event_header header;
1515                          u64    time;
1516                          u64    id;
1517                          u64    stream_id;
1518                          struct sample_id sample_id;
1519                      };
1520
1521              PERF_RECORD_FORK
1522                  This record indicates a fork event.
1523
1524                      struct {
1525                          struct perf_event_header header;
1526                          u32    pid, ppid;
1527                          u32    tid, ptid;
1528                          u64    time;
1529                          struct sample_id sample_id;
1530                      };
1531
1532              PERF_RECORD_READ
1533                  This record indicates a read event.
1534
1535                      struct {
1536                          struct perf_event_header header;
1537                          u32    pid, tid;
1538                          struct read_format values;
1539                          struct sample_id sample_id;
1540                      };
1541
1542              PERF_RECORD_SAMPLE
1543                  This record indicates a sample.
1544
1545                      struct {
1546                          struct perf_event_header header;
1547                          u64    sample_id;   /* if PERF_SAMPLE_IDENTIFIER */
1548                          u64    ip;          /* if PERF_SAMPLE_IP */
1549                          u32    pid, tid;    /* if PERF_SAMPLE_TID */
1550                          u64    time;        /* if PERF_SAMPLE_TIME */
1551                          u64    addr;        /* if PERF_SAMPLE_ADDR */
1552                          u64    id;          /* if PERF_SAMPLE_ID */
1553                          u64    stream_id;   /* if PERF_SAMPLE_STREAM_ID */
1554                          u32    cpu, res;    /* if PERF_SAMPLE_CPU */
1555                          u64    period;      /* if PERF_SAMPLE_PERIOD */
1556                          struct read_format v;
1557                                              /* if PERF_SAMPLE_READ */
1558                          u64    nr;          /* if PERF_SAMPLE_CALLCHAIN */
1559                          u64    ips[nr];     /* if PERF_SAMPLE_CALLCHAIN */
1560                          u32    size;        /* if PERF_SAMPLE_RAW */
1561                          char   data[size];  /* if PERF_SAMPLE_RAW */
1562                          u64    bnr;         /* if PERF_SAMPLE_BRANCH_STACK */
1563                          struct perf_branch_entry lbr[bnr];
1564                                              /* if PERF_SAMPLE_BRANCH_STACK */
1565                          u64    abi;         /* if PERF_SAMPLE_REGS_USER */
1566                          u64    regs[weight(mask)];
1567                                              /* if PERF_SAMPLE_REGS_USER */
1568                          u64    size;        /* if PERF_SAMPLE_STACK_USER */
1569                          char   data[size];  /* if PERF_SAMPLE_STACK_USER */
1570                          u64    dyn_size;    /* if PERF_SAMPLE_STACK_USER &&
1571                                                 size != 0 */
1572                          union perf_sample_weight weight;
1573                                              /* if PERF_SAMPLE_WEIGHT */
1574                                              /* || PERF_SAMPLE_WEIGHT_STRUCT */
1575                          u64    data_src;    /* if PERF_SAMPLE_DATA_SRC */
1576                          u64    transaction; /* if PERF_SAMPLE_TRANSACTION */
1577                          u64    abi;         /* if PERF_SAMPLE_REGS_INTR */
1578                          u64    regs[weight(mask)];
1579                                              /* if PERF_SAMPLE_REGS_INTR */
1580                          u64    phys_addr;   /* if PERF_SAMPLE_PHYS_ADDR */
1581                          u64    cgroup;      /* if PERF_SAMPLE_CGROUP */
1582                          u64    data_page_size;
1583                                            /* if PERF_SAMPLE_DATA_PAGE_SIZE */
1584                          u64    code_page_size;
1585                                            /* if PERF_SAMPLE_CODE_PAGE_SIZE */
1586                          u64    size;        /* if PERF_SAMPLE_AUX */
1587                          char   data[size];  /* if PERF_SAMPLE_AUX */
1588                      };
1589
1590                  sample_id
1591                      If PERF_SAMPLE_IDENTIFIER is enabled, a 64-bit unique ID
1592                      is included.  This is a  duplication  of  the  PERF_SAM‐
1593                      PLE_ID  id  value,  but included at the beginning of the
1594                      sample so parsers can easily obtain the value.
1595
1596                  ip  If PERF_SAMPLE_IP is enabled, then a 64-bit  instruction
1597                      pointer value is included.
1598
1599                  pid, tid
1600                      If  PERF_SAMPLE_TID is enabled, then a 32-bit process ID
1601                      and 32-bit thread ID are included.
1602
1603                  time
1604                      If PERF_SAMPLE_TIME is enabled, then a 64-bit  timestamp
1605                      is  included.   This is obtained via local_clock() which
1606                      is a hardware timestamp if  available  and  the  jiffies
1607                      value if not.
1608
1609                  addr
1610                      If PERF_SAMPLE_ADDR is enabled, then a 64-bit address is
1611                      included.  This is usually the address of a  tracepoint,
1612                      breakpoint, or software event; otherwise the value is 0.
1613
1614                  id  If  PERF_SAMPLE_ID is enabled, a 64-bit unique ID is in‐
1615                      cluded.  If the event is a member of an event group, the
1616                      group leader ID is returned.  This ID is the same as the
1617                      one returned by PERF_FORMAT_ID.
1618
1619                  stream_id
1620                      If PERF_SAMPLE_STREAM_ID is enabled, a 64-bit unique  ID
1621                      is included.  Unlike PERF_SAMPLE_ID the actual ID is re‐
1622                      turned, not the group leader.  This ID is  the  same  as
1623                      the one returned by PERF_FORMAT_ID.
1624
1625                  cpu, res
1626                      If  PERF_SAMPLE_CPU  is  enabled, this is a 32-bit value
1627                      indicating which CPU was being used, in  addition  to  a
1628                      reserved (unused) 32-bit value.
1629
1630                  period
1631                      If  PERF_SAMPLE_PERIOD  is enabled, a 64-bit value indi‐
1632                      cating the current sampling period is written.
1633
1634                  v   If PERF_SAMPLE_READ is  enabled,  a  structure  of  type
1635                      read_format  is included which has values for all events
1636                      in the event group.  The values included depend  on  the
1637                      read_format value used at perf_event_open() time.
1638
1639                  nr, ips[nr]
1640                      If  PERF_SAMPLE_CALLCHAIN is enabled, then a 64-bit num‐
1641                      ber is included which indicates how many 64-bit instruc‐
1642                      tion  pointers  follow.   These instruction pointers are
1643                      the current callchain.
1644
1645                  size, data[size]
1646                      If PERF_SAMPLE_RAW is enabled, then a 32-bit value indi‐
1647                      cating  size  is  included followed by an array of 8-bit
1648                      values of length size.  The values are padded with 0  to
1649                      have 64-bit alignment.
1650
1651                      This  RAW record data is opaque with respect to the ABI.
1652                      The ABI doesn't make any promises with  respect  to  the
1653                      stability  of  its  content;  it  may  vary depending on
1654                      event, hardware, and kernel version.
1655
1656                  bnr, lbr[bnr]
1657                      If PERF_SAMPLE_BRANCH_STACK is enabled,  then  a  64-bit
1658                      value indicating the number of records is included, fol‐
1659                      lowed by bnr perf_branch_entry structures which each in‐
1660                      clude the fields:
1661
1662                      from   This indicates the source instruction (may not be
1663                             a branch).
1664
1665                      to     The branch target.
1666
1667                      mispred
1668                             The branch target was mispredicted.
1669
1670                      predicted
1671                             The branch target was predicted.
1672
1673                      in_tx (since Linux 3.11)
1674                             The branch was in a transactional memory transac‐
1675                             tion.
1676
1677                      abort (since Linux 3.11)
1678                             The branch was in an aborted transactional memory
1679                             transaction.
1680
1681                      cycles (since Linux 4.3)
1682                             This reports the number of cycles  elapsed  since
1683                             the previous branch stack update.
1684
1685                      The  entries are from most to least recent, so the first
1686                      entry has the most recent branch.
1687
1688                      Support for mispred, predicted, and cycles is  optional;
1689                      if not supported, those values will be 0.
1690
1691                      The  type  of  branches  recorded  is  specified  by the
1692                      branch_sample_type field.
1693
1694                  abi, regs[weight(mask)]
1695                      If PERF_SAMPLE_REGS_USER is enabled, then the  user  CPU
1696                      registers are recorded.
1697
1698                      The  abi  field  is  one  of  PERF_SAMPLE_REGS_ABI_NONE,
1699                      PERF_SAMPLE_REGS_ABI_32, or PERF_SAMPLE_REGS_ABI_64.
1700
1701                      The regs field is an array of  the  CPU  registers  that
1702                      were  specified by the sample_regs_user attr field.  The
1703                      number of values is the number of bits set in  the  sam‐
1704                      ple_regs_user bit mask.
1705
1706                  size, data[size], dyn_size
1707                      If  PERF_SAMPLE_STACK_USER  is  enabled,  then  the user
1708                      stack is recorded.  This can be used to  generate  stack
1709                      backtraces.   size  is the size requested by the user in
1710                      sample_stack_user or else the maximum record size.  data
1711                      is  the  stack data (a raw dump of the memory pointed to
1712                      by the stack pointer at the time of sampling).  dyn_size
1713                      is  the amount of data actually dumped (can be less than
1714                      size).  Note that dyn_size is omitted if size is 0.
1715
1716                  weight
1717                      If PERF_SAMPLE_WEIGHT  or  PERF_SAMPLE_WEIGHT_STRUCT  is
1718                      enabled, then a 64-bit value provided by the hardware is
1719                      recorded that indicates how costly the event was.   This
1720                      allows  expensive  events  to  stand out more clearly in
1721                      profiles.
1722
1723                  data_src
1724                      If PERF_SAMPLE_DATA_SRC is enabled, then a 64-bit  value
1725                      is recorded that is made up of the following fields:
1726
1727                      mem_op
1728                          Type of opcode, a bitwise combination of:
1729
1730                          PERF_MEM_OP_NA          Not available
1731                          PERF_MEM_OP_LOAD        Load instruction
1732                          PERF_MEM_OP_STORE       Store instruction
1733                          PERF_MEM_OP_PFETCH      Prefetch
1734                          PERF_MEM_OP_EXEC        Executable code
1735
1736                      mem_lvl
1737                          Memory hierarchy level hit or miss, a bitwise combi‐
1738                          nation   of   the   following,   shifted   left   by
1739                          PERF_MEM_LVL_SHIFT:
1740
1741                          PERF_MEM_LVL_NA         Not available
1742                          PERF_MEM_LVL_HIT        Hit
1743                          PERF_MEM_LVL_MISS       Miss
1744                          PERF_MEM_LVL_L1         Level 1 cache
1745                          PERF_MEM_LVL_LFB        Line fill buffer
1746                          PERF_MEM_LVL_L2         Level 2 cache
1747                          PERF_MEM_LVL_L3         Level 3 cache
1748                          PERF_MEM_LVL_LOC_RAM    Local DRAM
1749                          PERF_MEM_LVL_REM_RAM1   Remote DRAM 1 hop
1750                          PERF_MEM_LVL_REM_RAM2   Remote DRAM 2 hops
1751                          PERF_MEM_LVL_REM_CCE1   Remote cache 1 hop
1752                          PERF_MEM_LVL_REM_CCE2   Remote cache 2 hops
1753                          PERF_MEM_LVL_IO         I/O memory
1754                          PERF_MEM_LVL_UNC        Uncached memory
1755
1756                      mem_snoop
1757                          Snoop  mode, a bitwise combination of the following,
1758                          shifted left by PERF_MEM_SNOOP_SHIFT:
1759
1760                          PERF_MEM_SNOOP_NA       Not available
1761                          PERF_MEM_SNOOP_NONE     No snoop
1762                          PERF_MEM_SNOOP_HIT      Snoop hit
1763                          PERF_MEM_SNOOP_MISS     Snoop miss
1764                          PERF_MEM_SNOOP_HITM     Snoop hit modified
1765
1766                      mem_lock
1767                          Lock instruction, a bitwise combination of the  fol‐
1768                          lowing, shifted left by PERF_MEM_LOCK_SHIFT:
1769
1770                          PERF_MEM_LOCK_NA        Not available
1771                          PERF_MEM_LOCK_LOCKED    Locked transaction
1772
1773                      mem_dtlb
1774                          TLB access hit or miss, a bitwise combination of the
1775                          following, shifted left by PERF_MEM_TLB_SHIFT:
1776
1777                          PERF_MEM_TLB_NA         Not available
1778                          PERF_MEM_TLB_HIT        Hit
1779                          PERF_MEM_TLB_MISS       Miss
1780                          PERF_MEM_TLB_L1         Level 1 TLB
1781                          PERF_MEM_TLB_L2         Level 2 TLB
1782                          PERF_MEM_TLB_WK         Hardware walker
1783                          PERF_MEM_TLB_OS         OS fault handler
1784
1785                  transaction
1786                      If the  PERF_SAMPLE_TRANSACTION  flag  is  set,  then  a
1787                      64-bit  field  is recorded describing the sources of any
1788                      transactional memory aborts.
1789
1790                      The field is a bitwise combination of the following val‐
1791                      ues:
1792
1793                      PERF_TXN_ELISION
1794                             Abort  from  an  elision type transaction (Intel-
1795                             CPU-specific).
1796
1797                      PERF_TXN_TRANSACTION
1798                             Abort from a generic transaction.
1799
1800                      PERF_TXN_SYNC
1801                             Synchronous abort (related to  the  reported  in‐
1802                             struction).
1803
1804                      PERF_TXN_ASYNC
1805                             Asynchronous  abort  (not related to the reported
1806                             instruction).
1807
1808                      PERF_TXN_RETRY
1809                             Retryable abort  (retrying  the  transaction  may
1810                             have succeeded).
1811
1812                      PERF_TXN_CONFLICT
1813                             Abort due to memory conflicts with other threads.
1814
1815                      PERF_TXN_CAPACITY_WRITE
1816                             Abort due to write capacity overflow.
1817
1818                      PERF_TXN_CAPACITY_READ
1819                             Abort due to read capacity overflow.
1820
1821                      In addition, a user-specified abort code can be obtained
1822                      from the high 32 bits of the field by shifting right  by
1823                      PERF_TXN_ABORT_SHIFT   and   masking   with   the  value
1824                      PERF_TXN_ABORT_MASK.
1825
1826                  abi, regs[weight(mask)]
1827                      If PERF_SAMPLE_REGS_INTR is enabled, then the CPU regis‐
1828                      ters at the time of the interrupt  are  recorded  (these
1828                      may be kernel registers, unlike PERF_SAMPLE_REGS_USER).
1829
1830                      The  abi  field  is  one  of  PERF_SAMPLE_REGS_ABI_NONE,
1831                      PERF_SAMPLE_REGS_ABI_32, or PERF_SAMPLE_REGS_ABI_64.
1832
1833                      The regs field is an array of  the  CPU  registers  that
1834                      were  specified by the sample_regs_intr attr field.  The
1835                      number of values is the number of bits set in  the  sam‐
1836                      ple_regs_intr bit mask.
1837
1838                  phys_addr
1839                      If  the  PERF_SAMPLE_PHYS_ADDR  flag  is  set,  then the
1840                      64-bit physical address is recorded.
1841
1842                  cgroup
1843                      If the PERF_SAMPLE_CGROUP flag is set, then  the  64-bit
1844                      cgroup  ID  (for  the perf_event subsystem) is recorded.
1845                      To get the pathname of the cgroup, match the ID  against
1846                      the one in a PERF_RECORD_CGROUP record.
1847
1848                  data_page_size
1849                      If  the PERF_SAMPLE_DATA_PAGE_SIZE flag is set, then the
1850                      64-bit page size value of the data address is recorded.
1851
1852                  code_page_size
1853                      If the PERF_SAMPLE_CODE_PAGE_SIZE flag is set, then  the
1854                      64-bit page size value of the ip address is recorded.
1855
1856                  size
1857                  data[size]
1858                      If  PERF_SAMPLE_AUX  is  enabled,  a snapshot of the aux
1859                      buffer is recorded.
1860
1861              PERF_RECORD_MMAP2
1862                  This record includes extended information on  mmap(2)  calls
1863                  returning  executable  mappings.   The  format is similar to
1864                  that of the PERF_RECORD_MMAP record, but includes extra val‐
1865                  ues  that  allow  uniquely identifying shared mappings.  De‐
1866                  pending on the  PERF_RECORD_MISC_MMAP_BUILD_ID  bit  in  the
1867                  header, the extra values have different layout and meanings.
1868
1869                      struct {
1870                          struct perf_event_header header;
1871                          u32    pid;
1872                          u32    tid;
1873                          u64    addr;
1874                          u64    len;
1875                          u64    pgoff;
1876                          union {
1877                              struct {
1878                                  u32    maj;
1879                                  u32    min;
1880                                  u64    ino;
1881                                  u64    ino_generation;
1882                              };
1883                              struct {   /* if PERF_RECORD_MISC_MMAP_BUILD_ID */
1884                                  u8     build_id_size;
1885                                  u8     __reserved_1;
1886                                  u16    __reserved_2;
1887                                  u8     build_id[20];
1888                              };
1889                          };
1890                          u32    prot;
1891                          u32    flags;
1892                          char   filename[];
1893                          struct sample_id sample_id;
1894                      };
1895
1896                  pid    is the process ID.
1897
1898                  tid    is the thread ID.
1899
1900                  addr   is the address of the allocated memory.
1901
1902                  len    is the length of the allocated memory.
1903
1904                  pgoff  is the page offset of the allocated memory.
1905
1906                  maj    is the major ID of the underlying device.
1907
1908                  min    is the minor ID of the underlying device.
1909
1910                  ino    is the inode number.
1911
1912                  ino_generation
1913                         is the inode generation.
1914
1915                  build_id_size
1916                         is the actual size of build_id field (up to 20).
1917
1918                  build_id
1919                         is raw data that identifies a binary.
1920
1921                  prot   is the protection information.
1922
1923                  flags  is the flags information.
1924
1925                  filename
1926                         is  a  string describing the backing of the allocated
1927                         memory.
1928
1929              PERF_RECORD_AUX (since Linux 4.1)
1930                  This record reports that new data is available in the  sepa‐
1931                  rate AUX buffer region.
1932
1933                      struct {
1934                          struct perf_event_header header;
1935                          u64    aux_offset;
1936                          u64    aux_size;
1937                          u64    flags;
1938                          struct sample_id sample_id;
1939                      };
1940
1941                  aux_offset
1942                         offset  in the AUX mmap region where the new data be‐
1943                         gins.
1944
1945                  aux_size
1946                         size of the data made available.
1947
1948                  flags  describes the AUX update.
1949
1950                         PERF_AUX_FLAG_TRUNCATED
1951                                if set, then the data returned  was  truncated
1952                                to fit the available buffer size.
1953
1954                         PERF_AUX_FLAG_OVERWRITE
1955                                if set, then the data returned has overwritten
1956                                previous data.
1957
1958              PERF_RECORD_ITRACE_START (since Linux 4.1)
1959                  This record indicates which process  has  initiated  an  in‐
1960                  struction  trace event, allowing tools to properly correlate
1961                  the instruction addresses in the AUX buffer with the  proper
1962                  executable.
1963
1964                      struct {
1965                          struct perf_event_header header;
1966                          u32    pid;
1967                          u32    tid;
1968                      };
1969
1970                  pid    process  ID  of  the  thread  starting an instruction
1971                         trace.
1972
1973                  tid    thread ID  of  the  thread  starting  an  instruction
1974                         trace.
1975
1976              PERF_RECORD_LOST_SAMPLES (since Linux 4.2)
1977                  When  using  hardware  sampling  (such  as  Intel PEBS) this
1978                  record indicates some number of samples that may  have  been
1979                  lost.
1980
1981                      struct {
1982                          struct perf_event_header header;
1983                          u64    lost;
1984                          struct sample_id sample_id;
1985                      };
1986
1987                  lost   the number of potentially lost samples.
1988
1989              PERF_RECORD_SWITCH (since Linux 4.3)
1990                  This  record  indicates  a context switch has happened.  The
1991                  PERF_RECORD_MISC_SWITCH_OUT bit in the misc field  indicates
1992                  whether  it  was a context switch into or away from the cur‐
1993                  rent process.
1994
1995                      struct {
1996                          struct perf_event_header header;
1997                          struct sample_id sample_id;
1998                      };
1999
2000              PERF_RECORD_SWITCH_CPU_WIDE (since Linux 4.3)
2001                  As with PERF_RECORD_SWITCH this record indicates  a  context
2002                  switch  has  happened,  but  it only occurs when sampling in
2003                  CPU-wide mode and provides  additional  information  on  the
2004                  process       being       switched       to/from.        The
2005                  PERF_RECORD_MISC_SWITCH_OUT bit in the misc field  indicates
2006                  whether  it  was a context switch into or away from the cur‐
2007                  rent process.
2008
2009                      struct {
2010                          struct perf_event_header header;
2011                          u32 next_prev_pid;
2012                          u32 next_prev_tid;
2013                          struct sample_id sample_id;
2014                      };
2015
2016                  next_prev_pid
2017                         The process ID of the previous (if switching  in)  or
2018                         next (if switching out) process on the CPU.
2019
2020                  next_prev_tid
2021                         The  thread  ID  of the previous (if switching in) or
2022                         next (if switching out) thread on the CPU.
2023
2024              PERF_RECORD_NAMESPACES (since Linux 4.11)
2025                  This record includes  various  namespace  information  of  a
2026                  process.
2027
2028                      struct {
2029                          struct perf_event_header header;
2030                          u32    pid;
2031                          u32    tid;
2032                          u64    nr_namespaces;
2033                          struct { u64 dev, inode } [nr_namespaces];
2034                          struct sample_id sample_id;
2035                      };
2036
2037                  pid    is the process ID.
2038
2039                  tid    is the thread ID.
2040
2041                  nr_namespaces
2042                         is the number of namespaces in this record.
2043
2044                  Each  namespace  has dev and inode fields and is recorded in
2045                  the fixed position like below:
2046
2047                  NET_NS_INDEX=0
2048                         Network namespace
2049
2050                  UTS_NS_INDEX=1
2051                         UTS namespace
2052
2053                  IPC_NS_INDEX=2
2054                         IPC namespace
2055
2056                  PID_NS_INDEX=3
2057                         PID namespace
2058
2059                  USER_NS_INDEX=4
2060                         User namespace
2061
2062                  MNT_NS_INDEX=5
2063                         Mount namespace
2064
2065                  CGROUP_NS_INDEX=6
2066                         Cgroup namespace
2067
2068              PERF_RECORD_KSYMBOL (since Linux 5.0)
2069                  This  record  indicates  kernel  symbol  register/unregister
2070                  events.
2071
2072                      struct {
2073                          struct perf_event_header header;
2074                          u64    addr;
2075                          u32    len;
2076                          u16    ksym_type;
2077                          u16    flags;
2078                          char   name[];
2079                          struct sample_id sample_id;
2080                      };
2081
2082                  addr   is the address of the kernel symbol.
2083
2084                  len    is the length of the kernel symbol.
2085
2086                  ksym_type
2087                         is the type of the kernel symbol.  Currently the fol‐
2088                         lowing types are available:
2089
2090                         PERF_RECORD_KSYMBOL_TYPE_BPF
2091                                The kernel symbol is a BPF function.
2092
2093                  flags  If the PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER  is  set,
2094                         then  this event is for unregistering the kernel sym‐
2095                         bol.
2096
2097              PERF_RECORD_BPF_EVENT (since Linux 5.0)
2098                  This record indicates a BPF program was loaded or unloaded.
2099
2100                      struct {
2101                          struct perf_event_header header;
2102                          u16 type;
2103                          u16 flags;
2104                          u32 id;
2105                          u8 tag[BPF_TAG_SIZE];
2106                          struct sample_id sample_id;
2107                      };
2108
2109                  type   is one of the following values:
2110
2111                         PERF_BPF_EVENT_PROG_LOAD
2112                                A BPF program is loaded
2113
2114                         PERF_BPF_EVENT_PROG_UNLOAD
2115                                A BPF program is unloaded
2116
2117                  id     is the ID of the BPF program.
2118
2119                  tag    is  the  tag  of   the   BPF   program.    Currently,
2120                         BPF_TAG_SIZE is defined as 8.
2121
2122              PERF_RECORD_CGROUP (since Linux 5.7)
2123                  This record indicates a new cgroup is created and activated.
2124
2125                      struct {
2126                          struct perf_event_header header;
2127                          u64    id;
2128                          char   path[];
2129                          struct sample_id sample_id;
2130                      };
2131
2132                  id     is the cgroup identifier.  This can be also retrieved
2133                         by name_to_handle_at(2) on the cgroup path (as a file
2134                         handle).
2135
2136                  path   is the path of the cgroup from the root.
2137
2138              PERF_RECORD_TEXT_POKE (since Linux 5.8)
2139                  This record indicates a change in the kernel text.  This in‐
2140                  cludes addition and removal of text, in which case old_len
2141                  or new_len, respectively, is zero.
2142
2143                      struct {
2144                          struct perf_event_header header;
2145                          u64    addr;
2146                          u16    old_len;
2147                          u16    new_len;
2148                          u8     bytes[];
2149                          struct sample_id sample_id;
2150                      };
2151
2152                  addr   is the address of the change
2153
2154                  old_len
2155                         is the old length
2156
2157                  new_len
2158                         is the new length
2159
2160                  bytes  contains old bytes immediately followed by new bytes.
2161
2162   Overflow handling
2163       Events  can be set to notify when a threshold is crossed, indicating an
2164       overflow.  Overflow conditions can be captured by monitoring the  event
2165       file  descriptor  with poll(2), select(2), or epoll(7).  Alternatively,
2166       the overflow events can be captured via a signal handler, by  enabling
2167       I/O  signaling  on the file descriptor; see the discussion of the F_SE‐
2168       TOWN and F_SETSIG operations in fcntl(2).
2169
2170       Overflows are generated only by  sampling  events  (sample_period  must
2171       have a nonzero value).
2172
2173       There are two ways to generate overflow notifications.
2174
2175       The first is to set a wakeup_events or wakeup_watermark value that will
2176       trigger if a certain number of samples or bytes have  been  written  to
2177       the mmap ring buffer.  In this case, POLL_IN is indicated.
2178
2179       The  other  way  is  by  use of the PERF_EVENT_IOC_REFRESH ioctl.  This
2180       ioctl adds to a counter that decrements each time the event  overflows.
2181       When  nonzero,  POLL_IN  is  indicated,  but once the counter reaches 0
2182       POLL_HUP is indicated and the underlying event is disabled.
2183
2184       Refreshing an event group leader refreshes all siblings and  refreshing
2185       with  a  parameter of 0 currently enables infinite refreshes; these be‐
2186       haviors are unsupported and should not be relied on.
2187
2188       Starting with Linux 3.18, POLL_HUP is indicated if the event being mon‐
2189       itored is attached to a different process and that process exits.
2190
2191   rdpmc instruction
2192       Starting  with  Linux  3.4 on x86, you can use the rdpmc instruction to
2193       get low-latency reads without having to enter the  kernel.   Note  that
2194       using  rdpmc  is  not necessarily faster than other methods for reading
2195       event values.
2196
2197       Support for this can be detected with the cap_usr_rdpmc  field  in  the
2198       mmap  page; documentation on how to calculate event values can be found
2199       in that section.
2200
2201       Originally, when rdpmc support was enabled, any process (not just  ones
2202       with  an  active  perf event) could use the rdpmc instruction to access
2203       the counters.  Starting with Linux 4.0, rdpmc support is  only  allowed
2204       if  an  event  is currently enabled in a process's context.  To restore
2205       the old behavior, write the value 2 to /sys/devices/cpu/rdpmc.
2206
2207   perf_event ioctl calls
2208       Various ioctls act on perf_event_open() file descriptors:
2209
2210       PERF_EVENT_IOC_ENABLE
2211              This enables the individual event or event  group  specified  by
2212              the file descriptor argument.
2213
2214              If  the  PERF_IOC_FLAG_GROUP  bit  is set in the ioctl argument,
2215              then all events in a group are enabled, even if the event speci‐
2216              fied is not the group leader (but see BUGS).
2217
2218       PERF_EVENT_IOC_DISABLE
2219              This disables the individual counter or event group specified by
2220              the file descriptor argument.
2221
2222              Enabling or disabling the leader of a group enables or  disables
2223              the  entire  group; that is, while the group leader is disabled,
2224              none of the counters in the group will count.  Enabling or  dis‐
2225              abling  a  member  of a group other than the leader affects only
2226              that counter; disabling a non-leader  stops  that  counter  from
2227              counting but doesn't affect any other counter.
2228
2229              If  the  PERF_IOC_FLAG_GROUP  bit  is set in the ioctl argument,
2230              then all events in a group are disabled, even if the event spec‐
2231              ified is not the group leader (but see BUGS).
2232
2233       PERF_EVENT_IOC_REFRESH
2234              Non-inherited overflow counters can use this to enable a counter
2235              for a number of overflows specified by the argument, after which
2236              it is disabled.  Subsequent calls of this ioctl add the argument
2237              value to the  current  count.   An  overflow  notification  with
2238              POLL_IN set will happen on each overflow until the count reaches
2239              0; when that happens a notification with POLL_HUP  set  is  sent
2240              and the event is disabled.  Using an argument of 0 is considered
2241              undefined behavior.
2242
2243       PERF_EVENT_IOC_RESET
2244              Reset the event count specified by the file descriptor  argument
2245              to  zero.  This resets only the counts; there is no way to reset
2246              the multiplexing time_enabled or time_running values.
2247
2248              If the PERF_IOC_FLAG_GROUP bit is set  in  the  ioctl  argument,
2249              then  all  events in a group are reset, even if the event speci‐
2250              fied is not the group leader (but see BUGS).
2251
2252       PERF_EVENT_IOC_PERIOD
2253              This updates the overflow period for the event.
2254
2255              Since Linux 3.7 (on ARM) and Linux  3.14  (all  other  architec‐
2256              tures),  the new period takes effect immediately.  On older ker‐
2257              nels, the new period did not take effect until  after  the  next
2258              overflow.
2259
2260              The  argument  is a pointer to a 64-bit value containing the de‐
2261              sired new period.
2262
2263              Prior to Linux 2.6.36, this ioctl always failed due to a bug  in
2264              the kernel.
2265
2266       PERF_EVENT_IOC_SET_OUTPUT
2267              This tells the kernel to report event notifications to the spec‐
2268              ified file descriptor rather than the default one.  The file de‐
2269              scriptors must all be on the same CPU.
2270
2271              The  argument  specifies  the  desired file descriptor, or -1 if
2272              output should be ignored.
2273
2274       PERF_EVENT_IOC_SET_FILTER (since Linux 2.6.33)
2275              This adds an ftrace filter to this event.
2276
2277              The argument is a pointer to the desired ftrace filter.
2278
2279       PERF_EVENT_IOC_ID (since Linux 3.12)
2280              This returns the event ID value for the  given  event  file  de‐
2281              scriptor.
2282
2283              The  argument  is a pointer to a 64-bit unsigned integer to hold
2284              the result.
2285
2286       PERF_EVENT_IOC_SET_BPF (since Linux 4.1)
2287              This allows attaching a Berkeley Packet Filter (BPF) program  to
2288              an  existing  kprobe  tracepoint  event.   You  need CAP_PERFMON
2289              (since Linux 5.8) or CAP_SYS_ADMIN privileges to use this ioctl.
2290
2291              The argument is a BPF program file descriptor that  was  created
2292              by a previous bpf(2) system call.
2293
2294       PERF_EVENT_IOC_PAUSE_OUTPUT (since Linux 4.7)
2295              This  allows  pausing  and  resuming the event's ring-buffer.  A
2296              paused ring-buffer does not prevent generation of  samples,  but
2297              simply  discards  them.   The  discarded  samples are considered
2298              lost, and cause a PERF_RECORD_LOST sample to be  generated  when
2299              possible.  An overflow signal may still be triggered by the dis‐
2300              carded sample even though the ring-buffer remains empty.
2301
2302              The argument is an unsigned 32-bit  integer.   A  nonzero  value
2303              pauses the ring-buffer, while a zero value resumes the ring-buf‐
2304              fer.
2305
2306       PERF_EVENT_IOC_MODIFY_ATTRIBUTES (since Linux 4.17)
2307              This allows modifying an existing event without the overhead  of
2308              closing  and reopening a new event.  Currently this is supported
2309              only for breakpoint events.
2310
2311              The argument is a pointer to a  perf_event_attr  structure  con‐
2312              taining the updated event settings.
2313
2314       PERF_EVENT_IOC_QUERY_BPF (since Linux 4.16)
2315              This allows querying which Berkeley Packet Filter (BPF) programs
2316              are attached to an existing kprobe tracepoint.  You can only at‐
2317              tach one BPF program per event, but you can have multiple events
2318              attached to a tracepoint.  Querying this value on one tracepoint
2319              event  returns the ID of all BPF programs in all events attached
2320              to the tracepoint.  You need CAP_PERFMON (since  Linux  5.8)  or
2321              CAP_SYS_ADMIN privileges to use this ioctl.
2322
2323              The argument is a pointer to a structure:
2324                  struct perf_event_query_bpf {
2325                      __u32    ids_len;
2326                      __u32    prog_cnt;
2327                      __u32    ids[0];
2328                  };
2329
2330              The  ids_len  field  indicates the number of ids that can fit in
2331              the provided ids array.  The prog_cnt value is filled in by  the
2332              kernel  with the number of attached BPF programs.  The ids array
2333              is filled with the ID of each attached BPF  program.   If  there
2334              are  more  programs  than will fit in the array, then the kernel
2335              will return ENOSPC and ids_len will indicate the number of  pro‐
2336              gram IDs that were successfully copied.
2337
2338   Using prctl(2)
2339       A  process  can enable or disable all currently open event groups using
2340       the prctl(2) PR_TASK_PERF_EVENTS_ENABLE and PR_TASK_PERF_EVENTS_DISABLE
2341       operations.  This applies only to events created locally by the calling
2342       process.  This does not apply to events created by other processes  at‐
2343       tached  to  the  calling  process  or  inherited  events  from a parent
2344       process.  Only group leaders are enabled and disabled,  not  any  other
2345       members of the groups.
2346
2347   perf_event related configuration files
2348       Files in /proc/sys/kernel/
2349
2350           /proc/sys/kernel/perf_event_paranoid
2351                  The  perf_event_paranoid  file can be set to restrict access
2352                  to the performance counters.
2353
2354                  2      allow only  user-space  measurements  (default  since
2355                         Linux 4.6).
2356                  1      allow  both kernel and user measurements (default be‐
2357                         fore Linux 4.6).
2358                  0      allow access to CPU-specific data but not raw  trace‐
2359                         point samples.
2360                  -1     no restrictions.
2361
2362                  The  existence  of the perf_event_paranoid file is the offi‐
2363                  cial  method  for   determining   if   a   kernel   supports
2364                  perf_event_open().
2365
2366           /proc/sys/kernel/perf_event_max_sample_rate
2367                  This  sets  the  maximum sample rate.  Setting this too high
2368                  can allow users to sample at a rate that impacts overall ma‐
2369                  chine  performance and potentially lock up the machine.  The
2370                  default value is 100000 (samples per second).
2371
2372           /proc/sys/kernel/perf_event_max_stack
2373                  This file sets the maximum depth of stack frame entries  re‐
2374                  ported when generating a call trace.
2375
2376           /proc/sys/kernel/perf_event_mlock_kb
2377                  Maximum amount of memory (in kB) an  unprivileged  user  can
2378                  mlock(2).  The default is 516 (kB).
2379
2380       Files in /sys/bus/event_source/devices/
2381
2382           Since Linux 2.6.34, the kernel supports having multiple PMUs avail‐
2383           able  for monitoring.  Information on how to program these PMUs can
2384           be found under /sys/bus/event_source/devices/.   Each  subdirectory
2385           corresponds to a different PMU.
2386
2387           /sys/bus/event_source/devices/*/type (since Linux 2.6.38)
2388                  This  contains an integer that can be used in the type field
2389                  of perf_event_attr to indicate that you  wish  to  use  this
2390                  PMU.
2391
2392           /sys/bus/event_source/devices/cpu/rdpmc (since Linux 3.4)
2393                  If this file is 1, then direct user-space access to the per‐
2394                  formance counter registers is allowed via the rdpmc instruc‐
2395                  tion.  This can be disabled by echoing 0 to the file.
2396
2397                  As  of  Linux  4.0  the  behavior has changed, so that 1 now
2398                  means only  allow  access  to  processes  with  active  perf
2399                  events, with 2 indicating the old allow-anyone-access behav‐
2400                  ior.
2401
2402           /sys/bus/event_source/devices/*/format/ (since Linux 3.4)
2403                  This subdirectory contains information on the  architecture-
2404                  specific  subfields  available  for  programming the various
2405                  config fields in the perf_event_attr struct.
2406
2407                  The content of each file is the name of  the  config  field,
2408                  followed  by  a  colon,  followed by a series of integer bit
2409                  ranges separated by commas.  For example, the file event may
2410                  contain  the  value  config1:1,6-10,44  which indicates that
2411                  event is an attribute that occupies bits 1, 6–10, and 44  of
2412                  perf_event_attr::config1.
2413
2414           /sys/bus/event_source/devices/*/events/ (since Linux 3.4)
2415                  This  subdirectory  contains  files  with predefined events.
2416                  The contents are strings describing the event  settings  ex‐
2417                  pressed  in terms of the fields found in the previously men‐
2418                  tioned ./format/ directory.  These are not necessarily  com‐
2419                  plete  lists of all events supported by a PMU, but usually a
2420                  subset of events deemed useful or interesting.
2421
2422                  The content of each file is a list of attribute names  sepa‐
2423                  rated  by  commas.  Each entry has an optional value (either
2424                  hex or decimal).  If no value is specified, then it  is  as‐
2425                  sumed  to be a single-bit field with a value of 1.  An exam‐
2426                  ple entry may look like this: event=0x2,inv,ldlat=3.
2427
2428           /sys/bus/event_source/devices/*/uevent
2429                  This file is the standard kernel device  interface  for  in‐
2430                  jecting hotplug events.
2431
2432           /sys/bus/event_source/devices/*/cpumask (since Linux 3.7)
2433                  The cpumask file contains a comma-separated list of integers
2434                  that indicate a representative CPU number  for  each  socket
2435                  (package)  on  the motherboard.  This is needed when setting
2436                  up uncore or  northbridge  events,  as  those  PMUs  present
2437                  socket-wide events.
2438

RETURN VALUE

2440       On  success, perf_event_open() returns the new file descriptor.  On er‐
2441       ror, -1 is returned and errno is set to indicate the error.
2442

ERRORS

2444       The errors returned by perf_event_open() can be inconsistent,  and  may
2445       vary across processor architectures and performance monitoring units.
2446
2447       E2BIG  Returned if the perf_event_attr size value is too small (smaller
2448              than PERF_ATTR_SIZE_VER0), too big (larger than the page  size),
2449              or  larger  than the kernel supports and the extra bytes are not
2450              zero.  When E2BIG is returned, the perf_event_attr size field is
2451              overwritten by the kernel to be the size of the structure it was
2452              expecting.
2453
2454       EACCES Returned when the requested event  requires  CAP_PERFMON  (since
2455              Linux  5.8)  or  CAP_SYS_ADMIN permissions (or a more permissive
2456              perf_event paranoid setting).  Some common cases  where  an  un‐
2457              privileged  process  may  encounter  this  error: attaching to a
2458              process owned by a different user; monitoring all processes on a
2459              given  CPU  (i.e.,  specifying  the pid argument as -1); and not
2460              setting exclude_kernel when the paranoid setting requires it.
2461
2462       EBADF  Returned if the group_fd file descriptor is not  valid,  or,  if
2463              PERF_FLAG_PID_CGROUP  is  set, the cgroup file descriptor in pid
2464              is not valid.
2465
2466       EBUSY (since Linux 4.1)
2467              Returned if another event already has exclusive  access  to  the
2468              PMU.
2469
2470       EFAULT Returned  if  the  attr  pointer points at an invalid memory ad‐
2471              dress.
2472
2473       EINTR  Returned when trying to mix perf and ftrace handling for  a  up‐
2474              robe.
2475
2476       EINVAL Returned if the specified event is invalid.  There are many pos‐
2477              sible reasons for this.  A non-exhaustive list:  sample_freq  is
2478              higher than the maximum setting; the cpu to monitor does not ex‐
2479              ist; read_format is out of range; sample_type is out  of  range;
2480              the flags value is out of range; exclusive or pinned set and the
2481              event is not a group leader; the event config values are out  of
2482              range  or  set  reserved bits; the generic event selected is not
2483              supported; or there is not  enough  room  to  add  the  selected
2484              event.
2485
2486       EMFILE Each  opened  event uses one file descriptor.  If a large number
2487              of events are opened, the per-process limit  on  the  number  of
2488              open file descriptors will be reached, and no more events can be
2489              created.
2490
2491       ENODEV Returned when the event involves a feature not supported by  the
2492              current CPU.
2493
2494       ENOENT Returned  if  the type setting is not valid.  This error is also
2495              returned for some unsupported generic events.
2496
2497       ENOSPC Prior to Linux 3.3, if there was not enough room for the  event,
2498              ENOSPC  was returned.  In Linux 3.3, this was changed to EINVAL.
2499              ENOSPC is still returned if  you  try  to  add  more  breakpoint
2500              events than supported by the hardware.
2501
2502       ENOSYS Returned  if PERF_SAMPLE_STACK_USER is set in sample_type and it
2503              is not supported by hardware.
2504
2505       EOPNOTSUPP
2506              Returned if an event requiring a specific  hardware  feature  is
2507              requested  but  there is no hardware support.  This includes re‐
2508              questing low-skid events if not supported, branch tracing if  it
2509              is not available, sampling if no PMU interrupt is available, and
2510              branch stacks for software events.
2511
2512       EOVERFLOW (since Linux 4.8)
2513              Returned  if  PERF_SAMPLE_CALLCHAIN  is   requested   and   sam‐
2514              ple_max_stack   is   larger   than   the  maximum  specified  in
2515              /proc/sys/kernel/perf_event_max_stack.
2516
2517       EPERM  Returned on many (but not all) architectures when an unsupported
2518              exclude_hv,  exclude_idle,  exclude_user, or exclude_kernel set‐
2519              ting is specified.
2520
2521              It can also happen, as with EACCES, when the requested event re‐
2522              quires  CAP_PERFMON  (since  Linux 5.8) or CAP_SYS_ADMIN permis‐
2523              sions (or a more permissive perf_event paranoid setting).   This
2524              includes  setting  a  breakpoint on a kernel address, and (since
2525              Linux 3.13) setting a kernel function-trace tracepoint.
2526
2527       ESRCH  Returned if attempting to attach to a process that does not  ex‐
2528              ist.
2529

STANDARDS

2531       Linux.
2532

HISTORY

2534       perf_event_open()  was  introduced  in  Linux  2.6.31  but  was  called
2535       perf_counter_open().  It was renamed in Linux 2.6.32.
2536

NOTES

2538       The official way of knowing if perf_event_open() support is enabled  is
2539       checking    for    the    existence    of   the   file   /proc/sys/ker‐
2540       nel/perf_event_paranoid.
2541
2542       The CAP_PERFMON capability (since Linux 5.8) provides a secure approach to
2543       performance monitoring and observability operations in a system accord‐
2544       ing to the principle of least privilege (POSIX IEEE 1003.1e).   Access‐
2545       ing  system  performance  monitoring and observability operations using
2546       CAP_PERFMON rather than the much more powerful  CAP_SYS_ADMIN  excludes
2547       chances  to  misuse  credentials  and  makes  operations  more  secure.
2548       CAP_SYS_ADMIN usage for secure system performance  monitoring  and  ob‐
2549       servability is discouraged in favor of the CAP_PERFMON capability.
2550

BUGS

2552       The  F_SETOWN_EX  option to fcntl(2) is needed to properly get overflow
2553       signals in threads.  This was introduced in Linux 2.6.32.
2554
2555       Prior to Linux 2.6.33 (at least for x86), the kernel did not  check  if
2556       events  could  be scheduled together until read time.  The same happens
2557       on all known kernels if the NMI watchdog is enabled.  This means to see
2558       if  a  given  set of events works you have to perf_event_open(), start,
2559       then read before you know for sure you can get valid measurements.
2560
2561       Prior to Linux 2.6.34, event constraints were not enforced by the  ker‐
2562       nel.  In that case, some events would silently return "0" if the kernel
2563       scheduled them in an improper counter slot.
2564
2565       Prior to Linux 2.6.34, there was a  bug  when  multiplexing  where  the
2566       wrong results could be returned.
2567
2568       Kernels  from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel
2569       if "inherit" is enabled and many threads are started.
2570
2571       Prior to Linux 2.6.35, PERF_FORMAT_GROUP did  not  work  with  attached
2572       processes.
2573
2574       There  is  a  bug in the kernel code between Linux 2.6.36 and Linux 3.0
2575       that ignores the "watermark" field and acts as if  a  wakeup_event  was
2576       chosen if the union has a nonzero value in it.
2577
2578       From  Linux 2.6.31 to Linux 3.4, the PERF_IOC_FLAG_GROUP ioctl argument
2579       was broken and would repeatedly operate on the event  specified  rather
2580       than iterating across all sibling events in a group.
2581
2582       From  Linux  3.4 to Linux 3.11, the mmap cap_usr_rdpmc and cap_usr_time
2583       bits mapped to the same location.   Code  should  migrate  to  the  new
2584       cap_user_rdpmc and cap_user_time fields instead.
2585
2586       Always  double-check your results!  Various generalized events have had
2587       wrong values.  For example, retired branches measured the  wrong  thing
2588       on AMD machines until Linux 2.6.35.
2589

EXAMPLES

2591       The  following  is  a short example that measures the total instruction
2592       count of a call to printf(3).
2593
2594       #include <linux/perf_event.h>
2595       #include <stdio.h>
2596       #include <stdlib.h>
2597       #include <string.h>
2598       #include <sys/ioctl.h>
2599       #include <sys/syscall.h>
2600       #include <unistd.h>
2601
2602       static long
2603       perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
2604                       int cpu, int group_fd, unsigned long flags)
2605       {
2606           int ret;
2607
2608           ret = syscall(SYS_perf_event_open, hw_event, pid, cpu,
2609                         group_fd, flags);
2610           return ret;
2611       }
2612
2613       int
2614       main(void)
2615       {
2616           int                     fd;
2617           long long               count;
2618           struct perf_event_attr  pe;
2619
2620           memset(&pe, 0, sizeof(pe));
2621           pe.type = PERF_TYPE_HARDWARE;
2622           pe.size = sizeof(pe);
2623           pe.config = PERF_COUNT_HW_INSTRUCTIONS;
2624           pe.disabled = 1;
2625           pe.exclude_kernel = 1;
2626           pe.exclude_hv = 1;
2627
2628           fd = perf_event_open(&pe, 0, -1, -1, 0);
2629           if (fd == -1) {
2630              fprintf(stderr, "Error opening leader %llx\n", pe.config);
2631              exit(EXIT_FAILURE);
2632           }
2633
2634           ioctl(fd, PERF_EVENT_IOC_RESET, 0);
2635           ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
2636
2637           printf("Measuring instruction count for this printf\n");
2638
2639           ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
2640           read(fd, &count, sizeof(count));
2641
2642           printf("Used %lld instructions\n", count);
2643
2644           close(fd);
2645       }
2646

SEE ALSO

2648       perf(1), fcntl(2), mmap(2), open(2), prctl(2), read(2)
2649
2650       Documentation/admin-guide/perf-security.rst in the kernel source tree
2651
2652
2653
2654Linux man-pages 6.05              2023-05-03                perf_event_open(2)
Impressum