1PERF_EVENT_OPEN(2)         Linux Programmer's Manual        PERF_EVENT_OPEN(2)
2
3
4

NAME

6       perf_event_open - set up performance monitoring
7

SYNOPSIS

9       #include <linux/perf_event.h>
10       #include <linux/hw_breakpoint.h>
11
12       int perf_event_open(struct perf_event_attr *attr,
13                           pid_t pid, int cpu, int group_fd,
14                           unsigned long flags);
15
16       Note: There is no glibc wrapper for this system call; see NOTES.
17

DESCRIPTION

19       Given  a  list of parameters, perf_event_open() returns a file descrip‐
20       tor, for use in subsequent system calls  (read(2),  mmap(2),  prctl(2),
21       fcntl(2), etc.).
22
23       A  call to perf_event_open() creates a file descriptor that allows mea‐
24       suring performance information.  Each file  descriptor  corresponds  to
25       one  event  that  is measured; these can be grouped together to measure
26       multiple events simultaneously.
27
28       Events can be enabled and disabled in two ways: via  ioctl(2)  and  via
29       prctl(2).   When  an  event  is  disabled it does not count or generate
30       overflows but does continue to exist and maintain its count value.
31
32       Events come in two flavors: counting and sampled.  A counting event  is
33       one  that  is  used  for  counting  the aggregate number of events that
34       occur.  In general, counting event results are gathered with a  read(2)
35       call.   A  sampling  event periodically writes measurements to a buffer
36       that can then be accessed via mmap(2).
37
38   Arguments
39       The argument pid allows events to be attached to processes  in  various
40       ways.   If  pid is 0, measurements happen on the current thread; if pid
41       is greater than 0, the process indicated by pid is measured, and if pid
42       is -1, all processes are counted.
43
44       The  cpu  argument allows measurements to be specific to a CPU.  If cpu
45       is greater than or equal to 0, measurements are restricted to the spec‐
46       ified CPU; if cpu is -1, the events are measured on all CPUs.
47
48       Note that the combination of pid == -1 and cpu == -1 is not valid.
49
50       A  pid  > 0 and cpu == -1 setting measures per-process and follows that
51       process to whatever CPU the process  gets  scheduled  to.   Per-process
52       events can be created by any user.
53
54       A  pid == -1 and cpu >= 0 setting is per-CPU and measures all processes
55       on the specified CPU.  Per-CPU events need the CAP_SYS_ADMIN capability
56       or a /proc/sys/kernel/perf_event_paranoid value of less than 1.
57
58       The  group_fd  argument  allows  event  groups to be created.  An event
59       group has one event which is the group leader.  The leader  is  created
60       first,  with  group_fd = -1.  The rest of the group members are created
61       with subsequent perf_event_open() calls with group_fd being set to  the
62       fd  of  the  group  leader.  (A single event on its own is created with
63       group_fd = -1 and is considered to be a group with only 1 member.)   An
64       event  group  is  scheduled onto the CPU as a unit: it will be put onto
65       the CPU only if all of the events in the group can be put onto the CPU.
66       This  means  that  the  values of the member events can be meaningfully
67       compared, added, divided (to get ratios), etc., with each other,  since
68       they have counted events for the same set of executed instructions.
69
70       The flags argument is formed by ORing together zero or more of the fol‐
71       lowing values:
72
73       PERF_FLAG_FD_NO_GROUP
74              This flag allows creating an event as part of an event group but
75              having no group leader.  It is unclear why this is useful.
76
77       PERF_FLAG_FD_OUTPUT
78              This  flag  re-routes  the  output  from  an  event to the group
79              leader.
80
81       PERF_FLAG_PID_CGROUP (Since Linux 2.6.39)
82              This flag activates  per-container  system-wide  monitoring.   A
83              container is an abstraction that isolates a set of resources for
84              finer grain control (CPUs, memory, etc.).  In  this  mode,  the
85              event  is  measured  only if the thread running on the monitored
86              CPU belongs to the designated container (cgroup).  The cgroup is
87              identified  by passing a file descriptor opened on its directory
88              in the cgroupfs filesystem.  For instance, if the cgroup to mon‐
89              itor   is   called  test,  then  a  file  descriptor  opened  on
90              /dev/cgroup/test (assuming cgroupfs is mounted  on  /dev/cgroup)
91              must  be  passed  as  the  pid  parameter.  cgroup monitoring is
92              available only for system-wide events and may therefore  require
93              extra permissions.
94
95       The  perf_event_attr structure provides detailed configuration informa‐
96       tion for the event being created.
97
98           struct perf_event_attr {
99               __u32     type;         /* Type of event */
100               __u32     size;         /* Size of attribute structure */
101               __u64     config;       /* Type-specific configuration */
102
103               union {
104                   __u64 sample_period;    /* Period of sampling */
105                   __u64 sample_freq;      /* Frequency of sampling */
106               };
107
108               __u64     sample_type;  /* Specifies values included in sample */
109               __u64     read_format;  /* Specifies values returned in read */
110
111               __u64     disabled       : 1,   /* off by default */
112                         inherit        : 1,   /* children inherit it */
113                         pinned         : 1,   /* must always be on PMU */
114                         exclusive      : 1,   /* only group on PMU */
115                         exclude_user   : 1,   /* don't count user */
116                         exclude_kernel : 1,   /* don't count kernel */
117                         exclude_hv     : 1,   /* don't count hypervisor */
118                         exclude_idle   : 1,   /* don't count when idle */
119                         mmap           : 1,   /* include mmap data */
120                         comm           : 1,   /* include comm data */
121                         freq           : 1,   /* use freq, not period */
122                         inherit_stat   : 1,   /* per task counts */
123                         enable_on_exec : 1,   /* next exec enables */
124                         task           : 1,   /* trace fork/exit */
125                         watermark      : 1,   /* wakeup_watermark */
126                         precise_ip     : 2,   /* skid constraint */
127                         mmap_data      : 1,   /* non-exec mmap data */
128                         sample_id_all  : 1,   /* sample_type all events */
129                         exclude_host   : 1,   /* don't count in host */
130                         exclude_guest  : 1,   /* don't count in guest */
131                         exclude_callchain_kernel : 1,
132                                               /* exclude kernel callchains */
133                         exclude_callchain_user   : 1,
134                                            /* exclude user callchains */
135                         __reserved_1   : 41;
136
137               union {
138                   __u32 wakeup_events;    /* wakeup every n events */
139                   __u32 wakeup_watermark; /* bytes before wakeup */
140               };
141
142               __u32     bp_type;          /* breakpoint type */
143
144               union {
145                   __u64 bp_addr;          /* breakpoint address */
146                   __u64 config1;          /* extension of config */
147               };
148
149               union {
150                   __u64 bp_len;           /* breakpoint length */
151                   __u64 config2;          /* extension of config1 */
152               };
153               __u64   branch_sample_type; /* enum perf_branch_sample_type */
154               __u64   sample_regs_user;   /* user regs to dump on samples */
155               __u32   sample_stack_user;  /* size of stack to dump on
156                                              samples */
157               __u32   __reserved_2;       /* Align to u64 */
158
159           };
160
161       The fields of the  perf_event_attr  structure  are  described  in  more
162       detail below:
163
164       type   This  field specifies the overall event type.  It has one of the
165              following values:
166
167              PERF_TYPE_HARDWARE
168                     This indicates one of the "generalized"  hardware  events
169                     provided  by the kernel.  See the config field definition
170                     for more details.
171
172              PERF_TYPE_SOFTWARE
173                     This indicates one of the  software-defined  events  pro‐
174                     vided  by  the  kernel  (even  if  no hardware support is
175                     available).
176
177              PERF_TYPE_TRACEPOINT
178                     This indicates a tracepoint provided by the kernel trace‐
179                     point infrastructure.
180
181              PERF_TYPE_HW_CACHE
182                     This  indicates  a hardware cache event.  This has a spe‐
183                     cial encoding, described in the config field definition.
184
185              PERF_TYPE_RAW
186                     This indicates a "raw" implementation-specific  event  in
187                     the config field.
188
189              PERF_TYPE_BREAKPOINT (Since Linux 2.6.33)
190                     This  indicates  a hardware breakpoint as provided by the
191                     CPU.   Breakpoints  can  be  read/write  accesses  to  an
192                     address as well as execution of an instruction address.
193
194              dynamic PMU
195                     Since  Linux 2.6.39, perf_event_open() can support multi‐
196                     ple PMUs.  To enable this, a value exported by the kernel
197                     can  be  used  in the type field to indicate which PMU to
198                     use.  The value to use can be found in the sysfs filesys‐
199                     tem:  there  is  a  subdirectory  per  PMU instance under
200                     /sys/bus/event_source/devices.   In  each   sub-directory
201                     there is a type file whose content is an integer that can
202                     be   used   in   the   type   field.     For    instance,
203                     /sys/bus/event_source/devices/cpu/type contains the value
204                     for the core CPU PMU, which is usually 4.
205
206       size   The size of the perf_event_attr structure  for  forward/backward
207              compatibility.  Set this using sizeof(struct perf_event_attr) to
208              allow the kernel to see the struct size at the time of  compila‐
209              tion.
210
211              The  related  define  PERF_ATTR_SIZE_VER0 is set to 64; this was
212              the size of the first published struct.  PERF_ATTR_SIZE_VER1  is
213              72,  corresponding  to  the  addition  of  breakpoints  in Linux
214              2.6.33.  PERF_ATTR_SIZE_VER2 is 80 corresponding to the addition
215              of  branch sampling in Linux 3.4.  PERF_ATTR_SIZE_VER3 is 96 cor‐
216              responding  to  the  addition  of  sample_regs_user   and   sam‐
217              ple_stack_user in Linux 3.7.
218
219       config This  specifies  which  event  you want, in conjunction with the
220              type field.  The config1 and config2 fields are also taken  into
221              account  in  cases  where 64 bits is not enough to fully specify
222              the event.  The encoding of these fields is event dependent.
223
224              The most significant bit (bit 63) of config  signifies  CPU-spe‐
225              cific  (raw) counter configuration data; if the most significant
226              bit is unset, the next 7 bits are an event type and the rest  of
227              the bits are the event identifier.
228
229              There  are  various ways to set the config field that are depen‐
230              dent on the value of the previously described type field.   What
231              follows  are  various possible settings for config separated out
232              by type.
233
234              If type is PERF_TYPE_HARDWARE, we are measuring one of the  gen‐
235              eralized hardware CPU events.  Not all of these are available on
236              all platforms.  Set config to one of the following:
237
238                   PERF_COUNT_HW_CPU_CYCLES
239                          Total cycles.  Be wary of what  happens  during  CPU
240                          frequency scaling.
241
242                   PERF_COUNT_HW_INSTRUCTIONS
243                          Retired  instructions.   Be  careful,  these  can be
244                          affected by various issues,  most  notably  hardware
245                          interrupt counts.
246
247                   PERF_COUNT_HW_CACHE_REFERENCES
248                          Cache  accesses.   Usually this indicates Last Level
249                          Cache accesses but this may vary depending  on  your
250                          CPU.  This may include prefetches and coherency mes‐
251                          sages; again this depends on the design of your CPU.
252
253                   PERF_COUNT_HW_CACHE_MISSES
254                          Cache misses.  Usually  this  indicates  Last  Level
255                          Cache  misses;  this  is intended to be used in con‐
256                          junction  with  the   PERF_COUNT_HW_CACHE_REFERENCES
257                          event to calculate cache miss rates.
258
259                   PERF_COUNT_HW_BRANCH_INSTRUCTIONS
260                          Retired branch instructions.  Prior to Linux 2.6.34,
261                          this used the wrong event on AMD processors.
262
263                   PERF_COUNT_HW_BRANCH_MISSES
264                          Mispredicted branch instructions.
265
266                   PERF_COUNT_HW_BUS_CYCLES
267                          Bus  cycles,  which  can  be  different  from  total
268                          cycles.
269
270                   PERF_COUNT_HW_STALLED_CYCLES_FRONTEND (Since Linux 3.0)
271                          Stalled cycles during issue.
272
273                   PERF_COUNT_HW_STALLED_CYCLES_BACKEND (Since Linux 3.0)
274                          Stalled cycles during retirement.
275
276                   PERF_COUNT_HW_REF_CPU_CYCLES (Since Linux 3.3)
277                          Total cycles; not affected by CPU frequency scaling.
278
279              If  type is PERF_TYPE_SOFTWARE, we are measuring software events
280              provided by the kernel.  Set config to one of the following:
281
282                   PERF_COUNT_SW_CPU_CLOCK
283                          This reports the CPU clock, a  high-resolution  per-
284                          CPU timer.
285
286                   PERF_COUNT_SW_TASK_CLOCK
287                          This reports a clock count specific to the task that
288                          is running.
289
290                   PERF_COUNT_SW_PAGE_FAULTS
291                          This reports the number of page faults.
292
293                   PERF_COUNT_SW_CONTEXT_SWITCHES
294                          This counts context switches.  Until  Linux  2.6.34,
295                          these  were all reported as user-space events; after
296                          that they are reported as happening in the kernel.
297
298                   PERF_COUNT_SW_CPU_MIGRATIONS
299                          This reports the number of  times  the  process  has
300                          migrated to a new CPU.
301
302                   PERF_COUNT_SW_PAGE_FAULTS_MIN
303                          This  counts the number of minor page faults.  These
304                          did not require disk I/O to handle.
305
306                   PERF_COUNT_SW_PAGE_FAULTS_MAJ
307                          This counts the number of major page faults.   These
308                          required disk I/O to handle.
309
310                   PERF_COUNT_SW_ALIGNMENT_FAULTS (Since Linux 2.6.33)
311                          This  counts  the number of alignment faults.  These
312                          happen when unaligned memory  accesses  happen;  the
313                          kernel  can handle these but it reduces performance.
314                          This happens only on some  architectures  (never  on
315                          x86).
316
317                   PERF_COUNT_SW_EMULATION_FAULTS (Since Linux 2.6.33)
318                          This  counts  the  number  of emulation faults.  The
319                          kernel sometimes traps on unimplemented instructions
320                          and  emulates  them  for user space.  This can nega‐
321                          tively impact performance.
322
323              If type is PERF_TYPE_TRACEPOINT, then we  are  measuring  kernel
324              tracepoints.   The  value  to use in config can be obtained from
325              under debugfs tracing/events/*/*/id if ftrace is enabled in  the
326              kernel.
327
328              If  type is PERF_TYPE_HW_CACHE, then we are measuring a hardware
329              CPU cache event.  To calculate the appropriate config value  use
330              the following equation:
331
332                      (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
333                      (perf_hw_cache_op_result_id << 16)
334
335                  where perf_hw_cache_id is one of:
336
337                      PERF_COUNT_HW_CACHE_L1D
338                             for measuring Level 1 Data Cache
339
340                      PERF_COUNT_HW_CACHE_L1I
341                             for measuring Level 1 Instruction Cache
342
343                      PERF_COUNT_HW_CACHE_LL
344                             for measuring Last-Level Cache
345
346                      PERF_COUNT_HW_CACHE_DTLB
347                             for measuring the Data TLB
348
349                      PERF_COUNT_HW_CACHE_ITLB
350                             for measuring the Instruction TLB
351
352                      PERF_COUNT_HW_CACHE_BPU
353                             for measuring the branch prediction unit
354
355                      PERF_COUNT_HW_CACHE_NODE (Since Linux 3.0)
356                             for measuring local memory accesses
357
358                  and perf_hw_cache_op_id is one of:
359
360                      PERF_COUNT_HW_CACHE_OP_READ
361                             for read accesses
362
363                      PERF_COUNT_HW_CACHE_OP_WRITE
364                             for write accesses
365
366                      PERF_COUNT_HW_CACHE_OP_PREFETCH
367                             for prefetch accesses
368
369                  and perf_hw_cache_op_result_id is one of:
370
371                      PERF_COUNT_HW_CACHE_RESULT_ACCESS
372                             to measure accesses
373
374                      PERF_COUNT_HW_CACHE_RESULT_MISS
375                             to measure misses
376
377              If  type  is  PERF_TYPE_RAW, then a custom "raw" config value is
378              needed.  Most CPUs support events that are not  covered  by  the
379              "generalized"  events.   These  are  implementation defined; see
380              your CPU manual (for example the Intel Volume  3B  documentation
381              or  the  AMD  BIOS  and  Kernel  Developer  Guide).  The libpfm4
382              library can be used to translate from the name in the  architec‐
383              tural  manuals to the raw hex value perf_event_open() expects in
384              this field.
385
386              If type is PERF_TYPE_BREAKPOINT, then leave config set to  zero.
387              Its parameters are set in other places.
388
389       sample_period, sample_freq
390              A  "sampling" counter is one that generates an interrupt every N
391              events, where N is given by sample_period.  A  sampling  counter
392              has  sample_period  >  0.   When  an  overflow interrupt occurs,
393              requested data is recorded in the mmap buffer.  The  sample_type
394              field controls what data is recorded on each interrupt.
395
396              sample_freq can be used if you wish to use frequency rather than
397              period.  In this case you set the freq flag.   The  kernel  will
398              adjust  the sampling period to try and achieve the desired rate.
399              The rate of adjustment is a timer tick.
400
401       sample_type
402              The various bits in this field specify which values  to  include
403              in the sample.  They will be recorded in a ring-buffer, which is
404              available to user space using mmap(2).  The order in  which  the
405              values are saved in the sample is documented in the MMAP  Layout
406              subsection below; it is not  the  enum  perf_event_sample_format
407              order.
408
409              PERF_SAMPLE_IP
410                     Records instruction pointer.
411
412              PERF_SAMPLE_TID
413                     Records the process and thread IDs.
414
415              PERF_SAMPLE_TIME
416                     Records a timestamp.
417
418              PERF_SAMPLE_ADDR
419                     Records an address, if applicable.
420
421              PERF_SAMPLE_READ
422                     Record counter values for all events in a group, not just
423                     the group leader.
424
425              PERF_SAMPLE_CALLCHAIN
426                     Records the callchain (stack backtrace).
427
428              PERF_SAMPLE_ID
429                     Records a unique ID for the opened event's group leader.
430
431              PERF_SAMPLE_CPU
432                     Records CPU number.
433
434              PERF_SAMPLE_PERIOD
435                     Records the current sampling period.
436
437              PERF_SAMPLE_STREAM_ID
438                     Records  a  unique  ID  for  the  opened  event.   Unlike
439                     PERF_SAMPLE_ID  the  actual ID is returned, not the group
440                     leader.  This ID is the  same  as  the  one  returned  by
441                     PERF_FORMAT_ID.
442
443              PERF_SAMPLE_RAW
444                     Records additional data, if applicable.  Usually returned
445                     by tracepoint events.
446
447              PERF_SAMPLE_BRANCH_STACK (Since Linux 3.4)
448                     This provides a record of recent branches, as provided by
449                     CPU  branch  sampling hardware (such as Intel Last Branch
450                     Record).  Not all hardware supports this feature.
451
452                     See the branch_sample_type field for how to filter  which
453                     branches are reported.
454
455              PERF_SAMPLE_REGS_USER (Since Linux 3.7)
456                     Records  the  current  user-level CPU register state (the
457                     values in the process before the kernel was called).
458
459              PERF_SAMPLE_STACK_USER (Since Linux 3.7)
460                     Records the user level stack, allowing stack unwinding.
461
462              PERF_SAMPLE_WEIGHT (Since Linux 3.10)
463                     Records a hardware provided weight value  that  expresses
464                     how  costly the sampled event was.  This allows the hard‐
465                     ware to highlight expensive events in a profile.
466
467              PERF_SAMPLE_DATA_SRC (Since Linux 3.10)
468                     Records the data source: where in  the  memory  hierarchy
469                     the  data  associated  with  the sampled instruction came
470                     from.  This is only available if the underlying  hardware
471                     supports this feature.
472
473       read_format
474              This  field specifies the format of the data returned by read(2)
475              on a perf_event_open() file descriptor.
476
477              PERF_FORMAT_TOTAL_TIME_ENABLED
478                     Adds the 64-bit time_enabled field.  This can be used  to
479                     calculate  estimated  totals  if the PMU is overcommitted
480                     and multiplexing is happening.
481
482              PERF_FORMAT_TOTAL_TIME_RUNNING
483                     Adds the 64-bit time_running field.  This can be used  to
484                     calculate  estimated  totals  if the PMU is overcommitted
485                     and  multiplexing is happening.
486
487              PERF_FORMAT_ID
488                     Adds a 64-bit unique value that corresponds to the  event
489                     group.
490
491              PERF_FORMAT_GROUP
492                     Allows  all  counter  values in an event group to be read
493                     with one read.
494
495       disabled
496              The disabled bit specifies whether the counter starts  out  dis‐
497              abled  or  enabled.  If disabled, the event can later be enabled
498              by ioctl(2), prctl(2), or enable_on_exec.
499
500       inherit
501              The inherit bit specifies that this counter should count  events
502              of child tasks as well as the task specified.  This applies only
503              to new children, not to any existing children at  the  time  the
504              counter  is  created  (nor to any new children of existing chil‐
505              dren).
506
507              Inherit does not work for  some  combinations  of  read_formats,
508              such as PERF_FORMAT_GROUP.
509
510       pinned The  pinned  bit  specifies that the counter should always be on
511              the CPU if at all possible.  It applies only to  hardware  coun‐
512              ters  and  only to group leaders.  If a pinned counter cannot be
513              put onto the CPU (e.g., because there are  not  enough  hardware
514              counters  or  because of a conflict with some other event), then
515              the counter goes into an 'error' state, where reads return  end-
516              of-file  (i.e.,  read(2)  returns 0) until the counter is subse‐
517              quently enabled or disabled.
518
519       exclusive
520              The exclusive bit specifies that when this counter's group is on
521              the  CPU,  it should be the only group using the CPU's counters.
522              In the future this may allow monitoring programs to support  PMU
523              features  that  need  to  run  alone so that they do not disrupt
524              other hardware counters.
525
526       exclude_user
527              If this bit is set, the count excludes  events  that  happen  in
528              user space.
529
530       exclude_kernel
531              If  this  bit  is  set, the count excludes events that happen in
532              kernel-space.
533
534       exclude_hv
535              If this bit is set, the count excludes events that happen in the
536              hypervisor.   This is mainly for PMUs that have built-in support
537              for handling this (such as POWER).  Extra support is needed  for
538              handling hypervisor measurements on most machines.
539
540       exclude_idle
541              If set, don't count when the CPU is idle.
542
543       mmap   The mmap bit enables recording of exec mmap events.
544
545       comm   The  comm  bit enables tracking of process command name as modi‐
546              fied by the exec(2) and prctl(PR_SET_NAME) system calls.  Unfor‐
547              tunately  for  tools,  there is no way to distinguish one system
548              call versus the other.
549
550       freq   If this bit is set, then sample_freq, not  sample_period,  is
551              used when setting up the sampling interval.
552
553       inherit_stat
554              This  bit  enables  saving of event counts on context switch for
555              inherited tasks.  This is meaningful only if the  inherit  field
556              is set.
557
558       enable_on_exec
559              If  this  bit is set, a counter is automatically enabled after a
560              call to exec(2).
561
562       task   If this bit is set, then fork/exit notifications are included in
563              the ring buffer.
564
565       watermark
566              If  set,  have  a  sampling  interrupt  happen when we cross the
567              wakeup_watermark boundary.  Otherwise  interrupts  happen  after
568              wakeup_events samples.
569
570       precise_ip (Since Linux 2.6.35)
571              This controls the amount of skid.  Skid is how many instructions
572              execute between an event of interest happening  and  the  kernel
573              being able to stop and record the event.  Smaller skid is better
574              and allows more accurate reporting of which events correspond to
575              which instructions, but hardware is often limited with how small
576              this can be.
577
578              The values of this are the following:
579
580              0 -    SAMPLE_IP can have arbitrary skid
581
582              1 -    SAMPLE_IP must have constant skid
583
584              2 -    SAMPLE_IP requested to have 0 skid
585
586              3 -    SAMPLE_IP    must    have    0    skid.      See     also
587                     PERF_RECORD_MISC_EXACT_IP.
588
589       mmap_data (Since Linux 2.6.36)
590              The  counterpart  of  the mmap field, but enables including data
591              mmap events in the ring-buffer.
592
593       sample_id_all (Since Linux 2.6.38)
594              If set, then TID, TIME, ID, CPU, and STREAM_ID can  additionally
595              be included in non-PERF_RECORD_SAMPLEs if the corresponding sam‐
596              ple_type is selected.
597
598       exclude_host (Since Linux 3.2)
599              Do not measure time spent in VM host.
600
601       exclude_guest (Since Linux 3.2)
602              Do not measure time spent in VM guest.
603
604       exclude_callchain_kernel (Since Linux 3.7)
605              Do not include kernel callchains.
606
607       exclude_callchain_user (Since Linux 3.7)
608              Do not include user callchains.
609
610       wakeup_events, wakeup_watermark
611              This union  sets  how  many  samples  (wakeup_events)  or  bytes
612              (wakeup_watermark)  happen  before  an  overflow signal happens.
613              Which one is used is selected by the watermark bitflag.
614
615              wakeup_events only counts PERF_RECORD_SAMPLE record  types.   To
616              receive  a  signal  for  every  incoming  PERF_RECORD  type  set
617              wakeup_watermark to 1.
618
619       bp_type (Since Linux 2.6.33)
620              This chooses the breakpoint type.  It is one of:
621
622              HW_BREAKPOINT_EMPTY
623                     no breakpoint
624
625              HW_BREAKPOINT_R
626                     count when we read the memory location
627
628              HW_BREAKPOINT_W
629                     count when we write the memory location
630
631              HW_BREAKPOINT_RW
632                     count when we read or write the memory location
633
634              HW_BREAKPOINT_X
635                     count when we execute code at the memory location
636
637              The values can be combined via a bitwise or, but the combination
638              of  HW_BREAKPOINT_R  or  HW_BREAKPOINT_W with HW_BREAKPOINT_X is
639              not allowed.
640
641       bp_addr (Since Linux 2.6.33)
642              bp_addr is the address of the breakpoint.  For execution
643              breakpoints this is the memory address of the instruction of
644              interest; for read and write breakpoints it is the address of
645              the memory location of interest.
646
647       config1 (Since Linux 2.6.39)
648              config1  is  used for setting events that need an extra register
649              or otherwise do not fit in the regular config field.   Raw  OFF‐
650              CORE_EVENTS  on  Nehalem/Westmere/SandyBridge  use this field on
651              3.3 and later kernels.
652
653       bp_len (Since Linux 2.6.33)
654              bp_len is the length of the breakpoint being measured if type is
655              PERF_TYPE_BREAKPOINT.     Options    are    HW_BREAKPOINT_LEN_1,
656              HW_BREAKPOINT_LEN_2,  HW_BREAKPOINT_LEN_4,  HW_BREAKPOINT_LEN_8.
657              For an execution breakpoint, set this to sizeof(long).
658
659       config2 (Since Linux 2.6.39)
660
661              config2 is a further extension of the config1 field.
662
663       branch_sample_type (Since Linux 3.4)
664              If PERF_SAMPLE_BRANCH_STACK is enabled, then this specifies what
665              branches to include in the branch record.  If the user does  not
666              set  privilege level explicitly, the kernel will use the event's
667              privilege level.  Event and branch privilege levels do not  have
668              to match.  The value is formed by ORing together zero or more of
669              the following values, although PERF_SAMPLE_BRANCH_ANY covers all
670              branch types.
671
672              PERF_SAMPLE_BRANCH_USER
673                     Branch target is in user space
674
675              PERF_SAMPLE_BRANCH_KERNEL
676                     Branch target is in kernel space
677
678              PERF_SAMPLE_BRANCH_HV
679                     Branch target is in hypervisor
680
681              PERF_SAMPLE_BRANCH_ANY
682                     Any branch type.
683
684              PERF_SAMPLE_BRANCH_ANY_CALL
685                     Any call branch
686
687              PERF_SAMPLE_BRANCH_ANY_RETURN
688                     Any return branch
689
690              PERF_SAMPLE_BRANCH_IND_CALL
691                     Indirect calls
692
693              PERF_SAMPLE_BRANCH_PLM_ALL
694                     User, kernel, and hv
695
696       sample_regs_user (Since Linux 3.7)
697              This  bitmask  defines  the set of user CPU registers to dump on
698              samples.  The layout of the register mask is  architecture  spe‐
699              cific     and     described     in     the     kernel     header
700              arch/ARCH/include/uapi/asm/perf_regs.h.
701
702       sample_stack_user (Since Linux 3.7)
703              This defines the size of the user stack  to  dump  if  PERF_SAM‐
704              PLE_STACK_USER is specified.
705
706   Reading results
707       Once  a  perf_event_open() file descriptor  has been opened, the values
708       of the events can be read from the file descriptor.   The  values  that
709       are  there are specified by the read_format field in the attr structure
710       at open time.
711
712       If you attempt to read into a buffer that is not big enough to hold the
713       data, the error ENOSPC is returned.
714
715       Here is the layout of the data returned by a read:
716
717       * If  PERF_FORMAT_GROUP  was specified to allow reading all events in a
718         group at once:
719
720             struct read_format {
721                 u64 nr;            /* The number of events */
722                 u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
723                 u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
724                 struct {
725                     u64 value;     /* The value of the event */
726                     u64 id;        /* if PERF_FORMAT_ID */
727                 } values[nr];
728             };
729
730       * If PERF_FORMAT_GROUP was not specified:
731
732             struct read_format {
733                 u64 value;         /* The value of the event */
734                 u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
735                 u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
736                 u64 id;            /* if PERF_FORMAT_ID */
737             };
738
739       The values read are as follows:
740
741       nr     The number of events in this file descriptor.  Only available if
742              PERF_FORMAT_GROUP was specified.
743
744       time_enabled, time_running
745              Total  time  the  event was enabled and running.  Normally these
746              are the same.  If more events are started than available counter
747              slots  on the PMU, then multiplexing happens and events run only
748              part of the time.  In that case the time_enabled and
749              time_running values can be used to scale an estimated value for
750              the count.
751
752       value  An unsigned 64-bit value containing the counter result.
753
754       id     A globally unique value for this particular event, only there if
755              PERF_FORMAT_ID was specified in read_format.
756
757   MMAP layout
758       When using perf_event_open() in sampled mode, asynchronous events (like
759       counter overflow or PROT_EXEC mmap tracking) are logged  into  a  ring-
760       buffer.  This ring-buffer is created and accessed through mmap(2).
761
762       The mmap size should be 1+2^n pages, where the first page is a metadata
763       page (struct perf_event_mmap_page) that contains various bits of infor‐
764       mation such as where the ring-buffer head is.
765
766       Before  kernel  2.6.39,  there  is a bug that means you must allocate a
767       mmap ring buffer when sampling even if you do not plan to access it.
768
769       The structure of the first metadata mmap page is as follows:
770
771           struct perf_event_mmap_page {
772               __u32 version;          /* version number of this structure */
773               __u32 compat_version;   /* lowest version this is compat with */
774               __u32 lock;             /* seqlock for synchronization */
775               __u32 index;            /* hardware counter identifier */
776               __s64 offset;           /* add to hardware counter value */
777               __u64 time_enabled;     /* time event active */
778               __u64 time_running;     /* time event on CPU */
779               union {
780                   __u64   capabilities;
781                   __u64   cap_usr_time  : 1,
782                           cap_usr_rdpmc : 1;
783               };
784               __u16   pmc_width;
785               __u16   time_shift;
786               __u32   time_mult;
787               __u64   time_offset;
788               __u64   __reserved[120];   /* Pad to 1k */
789               __u64   data_head;         /* head in the data section */
790               __u64   data_tail;         /* user-space written tail */
791           };
792
793       The following looks at the fields in the perf_event_mmap_page structure
794       in more detail:
795
796       version
797              Version number of this structure.
798
799       compat_version
800              The lowest version this is compatible with.
801
802       lock   A seqlock for synchronization.
803
804       index  A unique hardware counter identifier.
805
806       offset Add this to hardware counter value??
807
808       time_enabled
809              Time the event was active.
810
811       time_running
812              Time the event was running.
813
814       cap_usr_time
815              User time capability
816
817       cap_usr_rdpmc
818              If the hardware supports user-space read of performance counters
819              without syscall (this is the "rdpmc" instruction on  x86),  then
820              the following code can be used to do a read:
821
822                  u32 seq, time_mult, time_shift, idx, width;
823                  u64 count, enabled, running;
824                  u64 cyc, time_offset;
825                  s64 pmc = 0;
826
827                  do {
828                      seq = pc->lock;
829                      barrier();
830                      enabled = pc->time_enabled;
831                      running = pc->time_running;
832
833                      if (pc->cap_usr_time && enabled != running) {
834                          cyc = rdtsc();
835                          time_offset = pc->time_offset;
836                          time_mult   = pc->time_mult;
837                          time_shift  = pc->time_shift;
838                      }
839
840                      idx = pc->index;
841                      count = pc->offset;
842
843                      if (pc->cap_usr_rdpmc && idx) {
844                          width = pc->pmc_width;
845                          pmc = rdpmc(idx - 1);
846                      }
847
848                      barrier();
849                  } while (pc->lock != seq);
850
851       pmc_width
852              If cap_usr_rdpmc, this field provides the bit-width of the value
853              read using the rdpmc or equivalent  instruction.   This  can  be
854              used to sign extend the result like:
855
856                  pmc <<= 64 - pmc_width;
857                  pmc >>= 64 - pmc_width; // signed shift right
858                  count += pmc;
859
860       time_shift, time_mult, time_offset
861
862              If  cap_usr_time,  these  fields can be used to compute the time
863              delta since time_enabled (in nanoseconds) using rdtsc  or  simi‐
864              lar.
865
866                  u64 quot, rem;
867                  u64 delta;
868                  quot = (cyc >> time_shift);
869                  rem = cyc & ((1 << time_shift) - 1);
870                  delta = time_offset + quot * time_mult +
871                          ((rem * time_mult) >> time_shift);
872
873              Where  time_offset,  time_mult,  time_shift, and cyc are read in
874              the seqcount loop described above.  This delta can then be added
875              to enabled and possibly running (if idx), improving the scaling:
876
877                  enabled += delta;
878                  if (idx)
879                      running += delta;
880                  quot = count / running;
881                  rem  = count % running;
882                  count = quot * enabled + (rem * enabled) / running;
883
884       data_head
885              This points to the head of the data section.  The value continu‐
886              ously increases, it does not wrap.  The value needs to be  manu‐
887              ally wrapped by the size of the mmap buffer before accessing the
888              samples.
889
890              On SMP-capable platforms, after  reading  the  data_head  value,
891              user space should issue an rmb().
892
893       data_tail
894              When  the  mapping  is PROT_WRITE, the data_tail value should be
895              written by user space to reflect the last read  data.   In  this
896              case the kernel will not over-write unread data.
897
898       The following 2^n ring-buffer pages have the layout described below.
899
900       If perf_event_attr.sample_id_all is set, then all event types will have
901       the sample_type selected fields related  to  where/when  (identity)  an
902       event   took  place  (TID,  TIME,  ID,  CPU,  STREAM_ID)  described  in
903       PERF_RECORD_SAMPLE below.  These fields are stashed just after the
904       perf_event_header and the fields already present for the existing
905       record type, i.e., at the end of the payload.  That way a newer
906       perf.data file will be supported by older perf tools, with these new
907       optional fields being ignored.
908
909       The mmap values start with a header:
910
911           struct perf_event_header {
912               __u32   type;
913               __u16   misc;
914               __u16   size;
915           };
916
917       Below, we describe the perf_event_header fields in more detail.
918
919       type   The type value is one of the below.  The values  in  the  corre‐
920              sponding  record  (that  follows  the header) depend on the type
921              selected as shown.
922
923              PERF_RECORD_MMAP
924                  The MMAP events record the PROT_EXEC mappings so that we can
925                  correlate  user-space  IPs to code.  They have the following
926                  structure:
927
928                      struct {
929                          struct perf_event_header header;
930                          u32    pid, tid;
931                          u64    addr;
932                          u64    len;
933                          u64    pgoff;
934                          char   filename[];
935                      };
936
937              PERF_RECORD_LOST
938                  This record indicates when events are lost.
939
940                      struct {
941                          struct perf_event_header header;
942                          u64 id;
943                          u64 lost;
944                      };
945
946                  id     is the unique event ID  for  the  samples  that  were
947                         lost.
948
949                  lost   is the number of events that were lost.
950
951              PERF_RECORD_COMM
952                  This record indicates a change in the process name.
953
954                      struct {
955                          struct perf_event_header header;
956                          u32 pid, tid;
957                          char comm[];
958                      };
959
960              PERF_RECORD_EXIT
961                  This record indicates a process exit event.
962
963                      struct {
964                          struct perf_event_header header;
965                          u32 pid, ppid;
966                          u32 tid, ptid;
967                          u64 time;
968                      };
969
970              PERF_RECORD_THROTTLE, PERF_RECORD_UNTHROTTLE
971                  This record indicates a throttle/unthrottle event.
972
973                      struct {
974                          struct perf_event_header header;
975                          u64 time;
976                          u64 id;
977                          u64 stream_id;
978                      };
979
980              PERF_RECORD_FORK
981                  This record indicates a fork event.
982
983                      struct {
984                          struct perf_event_header header;
985                          u32 pid, ppid;
986                          u32 tid, ptid;
987                          u64 time;
988                      };
989
990              PERF_RECORD_READ
991                  This record indicates a read event.
992
993                      struct {
994                          struct perf_event_header header;
995                          u32 pid, tid;
996                          struct read_format values;
997                      };
998
999              PERF_RECORD_SAMPLE
1000                  This record indicates a sample.
1001
1002                      struct {
1003                          struct perf_event_header header;
1004                          u64   ip;         /* if PERF_SAMPLE_IP */
1005                          u32   pid, tid;   /* if PERF_SAMPLE_TID */
1006                          u64   time;       /* if PERF_SAMPLE_TIME */
1007                          u64   addr;       /* if PERF_SAMPLE_ADDR */
1008                          u64   id;         /* if PERF_SAMPLE_ID */
1009                          u64   stream_id;  /* if PERF_SAMPLE_STREAM_ID */
1010                          u32   cpu, res;   /* if PERF_SAMPLE_CPU */
1011                          u64   period;     /* if PERF_SAMPLE_PERIOD */
1012                          struct read_format v; /* if PERF_SAMPLE_READ */
1013                          u64   nr;         /* if PERF_SAMPLE_CALLCHAIN */
1014                          u64   ips[nr];    /* if PERF_SAMPLE_CALLCHAIN */
1015                          u32   size;       /* if PERF_SAMPLE_RAW */
1016                          char  data[size]; /* if PERF_SAMPLE_RAW */
1017                          u64   bnr;        /* if PERF_SAMPLE_BRANCH_STACK */
1018                          struct perf_branch_entry lbr[bnr];
1019                                            /* if PERF_SAMPLE_BRANCH_STACK */
1020                          u64   abi;        /* if PERF_SAMPLE_REGS_USER */
1021                          u64   regs[weight(mask)];
1022                                            /* if PERF_SAMPLE_REGS_USER */
1023                          u64   size;       /* if PERF_SAMPLE_STACK_USER */
1024                          char  data[size]; /* if PERF_SAMPLE_STACK_USER */
1025                          u64   dyn_size;   /* if PERF_SAMPLE_STACK_USER */
1026                          u64   weight;     /* if PERF_SAMPLE_WEIGHT */
1027                          u64   data_src;   /* if PERF_SAMPLE_DATA_SRC */
1028                      };
1029
1030                  ip     If  PERF_SAMPLE_IP is enabled, then a 64-bit instruc‐
1031                         tion pointer value is included.
1032
1033                  pid, tid
1034                         If PERF_SAMPLE_TID is enabled, then a 32-bit  process
1035                         ID and 32-bit thread ID are included.
1036
1037                  time   If  PERF_SAMPLE_TIME  is enabled, then a 64-bit time‐
1038                         stamp   is   included.    This   is   obtained    via
1039                         local_clock() which is a hardware timestamp if avail‐
1040                         able and the jiffies value if not.
1041
1042                  addr   If PERF_SAMPLE_ADDR is enabled, then a 64-bit address
1043                         is included.  This is usually the address of a trace‐
1044                         point, breakpoint, or software event;  otherwise  the
1045                         value is 0.
1046
1047                  id     If  PERF_SAMPLE_ID  is enabled, a 64-bit unique ID is
1048                         included.  If the event  is  a  member  of  an  event
1049                         group,  the  group leader ID is returned.  This ID is
1050                         the same as the one returned by PERF_FORMAT_ID.
1051
1052                  stream_id
1053                         If PERF_SAMPLE_STREAM_ID is enabled, a 64-bit  unique
1054                         ID  is included.  Unlike PERF_SAMPLE_ID the actual ID
1055                         is returned, not the group leader.  This  ID  is  the
1056                         same as the one returned by PERF_FORMAT_ID.
1057
1058                  cpu, res
1059                         If PERF_SAMPLE_CPU is enabled, this is a 32-bit value
1060                         indicating which CPU was being used, in addition to a
1061                         reserved (unused) 32-bit value.
1062
1063                  period If  PERF_SAMPLE_PERIOD  is  enabled,  a  64-bit value
1064                         indicating the current sampling period is written.
1065
1066                  v      If PERF_SAMPLE_READ is enabled, a structure  of  type
1067                         read_format  is  included  which  has  values for all
1068                         events in  the  event  group.   The  values  included
1069                         depend    on    the   read_format   value   used   at
1070                         perf_event_open() time.
1071
1072                  nr, ips[nr]
1073                         If PERF_SAMPLE_CALLCHAIN is enabled,  then  a  64-bit
1074                         number is included which indicates how many following
1075                         64-bit instruction pointers will follow.  This is the
1076                         current callchain.
1077
1078                  size, data[size]
1079                         If  PERF_SAMPLE_RAW  is  enabled, then a 32-bit value
1080                         indicating size is included followed by an  array  of
1081                         8-bit  values  of length size.  The values are padded
1082                         with 0 to have 64-bit alignment.
1083
1084                         This RAW record data is opaque with  respect  to  the
1085                         ABI.   The ABI doesn't make any promises with respect
1086                         to the stability of its content, it may vary  depend‐
1087                         ing on event, hardware, and kernel version.
1088
1089                  bnr, lbr[bnr]
1090                         If PERF_SAMPLE_BRANCH_STACK is enabled, then a 64-bit
1091                         value indicating the number of records  is  included,
1092                         followed  by  bnr  perf_branch_entry structures which
1093                         each include the fields:
1094
1095                         from   indicating the source instruction (may not  be
1096                                a branch)
1097
1098                         to     the branch target
1099
1100                         mispred
1101                                the branch target was mispredicted
1102
1103                         predicted
1104                                the branch target was predicted.
1105
1106                         The entries are from most to least recent, so the
1107                         first entry has the most recent branch.
1108
1109                         Support for mispred and predicted is optional; if not
1110                         supported, both values will be 0.
1111
1112                  abi, regs[weight(mask)]
1113                         If  PERF_SAMPLE_REGS_USER  is  enabled, then the user
1114                         CPU registers are recorded.
1115
1116                         The abi field is  one  of  PERF_SAMPLE_REGS_ABI_NONE,
1117                         PERF_SAMPLE_REGS_ABI_32 or PERF_SAMPLE_REGS_ABI_64.
1118
1119                         The  regs field is an array of the CPU registers that
1120                         were specified by the  sample_regs_user  attr  field.
1121                         The number of values is the number of bits set in the
1122                         sample_regs_user bitmask.
1123
1124                  size, data[size], dyn_size
1125                         If PERF_SAMPLE_STACK_USER is enabled, then record the
1126                         user  stack  to enable backtracing.  size is the size
1127                         requested by the user in sample_stack_user or else
1128                         the maximum record size.  data is the stack data.
1129                         dyn_size is the amount of data actually  dumped  (can
1130                         be less than size).
1131
1132                  weight If PERF_SAMPLE_WEIGHT is enabled, then a 64-bit value
1133                         provided by the hardware is recorded  that  indicates
1134                         how  costly  the  event  was.   This allows expensive
1135                         events to stand out more clearly in profiles.
1136
1137                  data_src
1138                         If PERF_SAMPLE_DATA_SRC is enabled, then a 64-bit
1139                         value  is  recorded  that is made up of the following
1140                         fields:
1141
1142                         mem_op type  of  opcode,  a  bitwise  combination  of
1143                                PERF_MEM_OP_NA         (not        available),
1144                                PERF_MEM_OP_LOAD      (load      instruction),
1145                                PERF_MEM_OP_STORE     (store     instruction),
1146                                PERF_MEM_OP_PFETCH       (prefetch),       and
1147                                PERF_MEM_OP_EXEC (executable code).
1148
1149                         mem_lvl
1150                                memory  hierarchy level hit or miss, a bitwise
1151                                combination  of  PERF_MEM_LVL_NA  (not  avail‐
1152                                able),         PERF_MEM_LVL_HIT         (hit),
1153                                PERF_MEM_LVL_MISS   (miss),    PERF_MEM_LVL_L1
1154                                (level  1  cache), PERF_MEM_LVL_LFB (line fill
1155                                buffer),  PERF_MEM_LVL_L2  (level  2   cache),
1156                                PERF_MEM_LVL_L3      (level      3     cache),
1157                                PERF_MEM_LVL_LOC_RAM       (local       DRAM),
1158                                PERF_MEM_LVL_REM_RAM1  (remote  DRAM  1  hop),
1159                                PERF_MEM_LVL_REM_RAM2 (remote  DRAM  2  hops),
1160                                PERF_MEM_LVL_REM_CCE1  (remote  cache  1 hop),
1161                                PERF_MEM_LVL_REM_CCE2 (remote cache  2  hops),
1162                                PERF_MEM_LVL_IO      (I/O     memory),     and
1163                                PERF_MEM_LVL_UNC (uncached memory).
1164
1165                         mem_snoop
1166                                snoop   mode,   a   bitwise   combination   of
1167                                PERF_MEM_SNOOP_NA       (not       available),
1168                                PERF_MEM_SNOOP_NONE        (no         snoop),
1169                                PERF_MEM_SNOOP_HIT         (snoop        hit),
1170                                PERF_MEM_SNOOP_MISS    (snoop    miss),    and
1171                                PERF_MEM_SNOOP_HITM (snoop hit modified).
1172
1173                         mem_lock
1174                                lock  instruction,  a  bitwise  combination of
1175                                PERF_MEM_LOCK_NA    (not    available)     and
1176                                PERF_MEM_LOCK_LOCKED (locked transaction).
1177
1178                         mem_dtlb
1179                                TLB access hit or miss, a bitwise combination
1180                                of    PERF_MEM_TLB_NA     (not     available),
1181                                PERF_MEM_TLB_HIT    (hit),   PERF_MEM_TLB_MISS
1182                                (miss),   PERF_MEM_TLB_L1   (level   1   TLB),
1183                                PERF_MEM_TLB_L2 (level 2 TLB), PERF_MEM_TLB_WK
1184                                (hardware  walker),  and  PERF_MEM_TLB_OS  (OS
1185                                fault handler).
1186
1187       misc   The misc field contains additional information about the sample.
1188
1189              The  CPU  mode can be determined from this value by masking with
1190              PERF_RECORD_MISC_CPUMODE_MASK and looking for one of the follow‐
1191              ing  (note  these  are  not  bit masks, only one can be set at a
1192              time):
1193
1194              PERF_RECORD_MISC_CPUMODE_UNKNOWN
1195                     Unknown CPU mode.
1196
1197              PERF_RECORD_MISC_KERNEL
1198                     Sample happened in the kernel.
1199
1200              PERF_RECORD_MISC_USER
1201                     Sample happened in user code.
1202
1203              PERF_RECORD_MISC_HYPERVISOR
1204                     Sample happened in the hypervisor.
1205
1206              PERF_RECORD_MISC_GUEST_KERNEL
1207                     Sample happened in the guest kernel.
1208
1209              PERF_RECORD_MISC_GUEST_USER
1210                     Sample happened in guest user code.
1211
1212              In addition, one of the following bits can be set:
1213
1214              PERF_RECORD_MISC_MMAP_DATA
1215                     This is set when the mapping is not executable; otherwise
1216                     the mapping is executable.
1217
1218              PERF_RECORD_MISC_EXACT_IP
1219                     This  indicates that the content of PERF_SAMPLE_IP points
1220                     to the actual instruction that triggered the event.   See
1221                     also perf_event_attr.precise_ip.
1222
1223              PERF_RECORD_MISC_EXT_RESERVED
1224                     This  indicates  there  is  extended data available (cur‐
1225                     rently not used).
1226
1227       size   This indicates the size of the record.
1228
1229   Signal overflow
1230       Events can be set to deliver a signal when a threshold is crossed.  The
1231       signal handler is set up using the poll(2), select(2), epoll(7), and
1232       fcntl(2) system calls.
1233
1234       To generate signals, sampling must be enabled (sample_period must  have
1235       a non-zero value).
1236
1237       There are two ways to generate signals.
1238
1239       The first is to set a wakeup_events or wakeup_watermark value that will
1240       generate a signal if a certain number of samples  or  bytes  have  been
1241       written to the mmap ring buffer.  In this case a signal of type POLL_IN
1242       is sent.
1243
1244       The other way is by use  of  the  PERF_EVENT_IOC_REFRESH  ioctl.   This
1245       ioctl  adds to a counter that decrements each time the event overflows.
1246       When non-zero, a POLL_IN signal is sent on overflow, but once the value
1247       reaches  0,  a signal is sent of type POLL_HUP and the underlying event
1248       is disabled.
1249
1250       Note: on newer kernels (definitely noticed with 3.2) a signal  is  pro‐
1251       vided for every overflow, even if wakeup_events is not set.
1252
1253   rdpmc instruction
1254       Starting  with  Linux  3.4 on x86, you can use the rdpmc instruction to
1255       get low-latency reads without having to enter the  kernel.   Note  that
1256       using  rdpmc  is  not necessarily faster than other methods for reading
1257       event values.
1258
1259       Support for this can be detected with the cap_usr_rdpmc  field  in  the
1260       mmap  page; documentation on how to calculate event values can be found
1261       in that section.
1262
1263   perf_event ioctl calls
1264       Various ioctls act on perf_event_open() file descriptors:
1265
1266       PERF_EVENT_IOC_ENABLE
1267              Enables the individual event or event  group  specified  by  the
1268              file descriptor argument.
1269
1270              If  the  PERF_IOC_FLAG_GROUP  bit  is set in the ioctl argument,
1271              then all events in a group are enabled, even if the event speci‐
1272              fied is not the group leader (but see BUGS).
1273
1274       PERF_EVENT_IOC_DISABLE
1275              Disables  the individual counter or event group specified by the
1276              file descriptor argument.
1277
1278              Enabling or disabling the leader of a group enables or  disables
1279              the  entire  group; that is, while the group leader is disabled,
1280              none of the counters in the group will count.  Enabling or  dis‐
1281              abling  a  member  of a group other than the leader affects only
1282              that counter; disabling a non-leader  stops  that  counter  from
1283              counting but doesn't affect any other counter.
1284
1285              If  the  PERF_IOC_FLAG_GROUP  bit  is set in the ioctl argument,
1286              then all events in a group are disabled, even if the event spec‐
1287              ified is not the group leader (but see BUGS).
1288
1289       PERF_EVENT_IOC_REFRESH
1290              Non-inherited overflow counters can use this to enable a counter
1291              for a number of overflows specified by the argument, after which
1292              it is disabled.  Subsequent calls of this ioctl add the argument
1293              value to the current count.  A signal with POLL_IN set will hap‐
1294              pen  on  each overflow until the count reaches 0; when that hap‐
1295              pens a signal with POLL_HUP set is sent and the  event  is  dis‐
1296              abled.  Using an argument of 0 is considered undefined behavior.
1297
1298       PERF_EVENT_IOC_RESET
1299              Reset  the event count specified by the file descriptor argument
1300              to zero.  This resets only the counts; there is no way to  reset
1301              the multiplexing time_enabled or time_running values.
1302
1303              If  the  PERF_IOC_FLAG_GROUP  bit  is set in the ioctl argument,
1304              then all events in a group are reset, even if the  event  speci‐
1305              fied is not the group leader (but see BUGS).
1306
1307       PERF_EVENT_IOC_PERIOD
1308              IOC_PERIOD is the command to update the period; the  change  does
1309              not affect the current period but takes effect from the next one.
1310
1311              The argument is a pointer  to  a  64-bit  value  containing  the
1312              desired new period.
1313
1314       PERF_EVENT_IOC_SET_OUTPUT
1315              This tells the kernel to report event notifications to the spec‐
1316              ified file descriptor rather than the  default  one.   The  file
1317              descriptors must all be on the same CPU.
1318
1319              The  argument  specifies  the  desired file descriptor, or -1 if
1320              output should be ignored.
1321
1322       PERF_EVENT_IOC_SET_FILTER (Since Linux 2.6.33)
1323              This adds an ftrace filter to this event.
1324
1325              The argument is a pointer to the desired ftrace filter.
1326
1327   Using prctl
1328       A process can enable or disable all the event groups that are  attached
1329       to    it    using    the    prctl(2)   PR_TASK_PERF_EVENTS_ENABLE   and
1330       PR_TASK_PERF_EVENTS_DISABLE operations.  This applies to  all  counters
1331       on  the current process, whether created by this process or by another,
1332       and does not affect any counters that this process has created on other
1333       processes.   It  enables  or  disables  only the group leaders, not any
1334       other members in the groups.
1335
1336   perf_event related configuration files
1337       Files in /proc/sys/kernel/
1338
1339           /proc/sys/kernel/perf_event_paranoid
1340
1341                  The perf_event_paranoid file can be set to  restrict  access
1342                  to the performance counters.
1343
1344                  2 - only allow user-space measurements
1345
1346                  1 - (default) allow both kernel and user measurements
1347
1348                  0 - allow access to CPU-specific data but not raw tracepoint
1349                  samples
1350
1351                  -1 - no restrictions
1352
1353                  The existence of the perf_event_paranoid file is  the  offi‐
1354                  cial   method   for   determining   if   a  kernel  supports
1355                  perf_event_open().
1356
1357           /proc/sys/kernel/perf_event_max_sample_rate
1358
1359                  This sets the maximum sample rate.  Setting  this  too  high
1360                  can  allow  users  to  sample at a rate that impacts overall
1361                  machine performance and potentially  lock  up  the  machine.
1362                  The default value is 100000 (samples per second).
1363
1364           /proc/sys/kernel/perf_event_mlock_kb
1365
1366                  Maximum number of pages an unprivileged user can  mlock(2).
1367                  The default is 516 (kB).
1368
1369       Files in /sys/bus/event_source/devices/
1370           Since Linux 2.6.34 the kernel supports having multiple PMUs  avail‐
1371           able  for monitoring.  Information on how to program these PMUs can
1372           be found under /sys/bus/event_source/devices/.   Each  subdirectory
1373           corresponds to a different PMU.
1374
1375           /sys/bus/event_source/devices/*/type (Since Linux 2.6.38)
1376                  This  contains an integer that can be used in the type field
1377                  of perf_event_attr to indicate you wish to use this PMU.
1378
1379           /sys/bus/event_source/devices/*/rdpmc (Since Linux 3.4)
1380                  If this file is 1, then direct user-space access to the per‐
1381                  formance counter registers is allowed via the rdpmc instruc‐
1382                  tion.  This can be disabled by echoing 0 to the file.
1383
1384           /sys/bus/event_source/devices/*/format/ (Since Linux 3.4)
1385                  This sub-directory contains information on the architecture-
1386                  specific  sub-fields  available  for programming the various
1387                  config fields in the perf_event_attr struct.
1388
1389                  The content of each file is the name of  the  config  field,
1390                  followed  by  a  colon,  followed by a series of integer bit
1391                  ranges separated by commas.  For example, the file event may
1392                  contain  the  value  config1:1,6-10,44  which indicates that
1393                  event is an attribute that occupies bits 1,6-10, and  44  of
1394                  perf_event_attr::config1.
1395
1396           /sys/bus/event_source/devices/*/events/ (Since Linux 3.4)
1397                  This  sub-directory  contains files with pre-defined events.
1398                  The contents  are  strings  describing  the  event  settings
1399                  expressed  in  terms  of  the fields found in the previously
1400                  mentioned ./format/ directory.  These  are  not  necessarily
1401                  complete lists of all events supported by a PMU, but usually
1402                  a subset of events deemed useful or interesting.
1403
1404                  The content of each file is a list of attribute names  sepa‐
1405                  rated  by  commas.  Each entry has an optional value (either
1406                  hex or decimal).  If  no  value  is  specified  then  it  is
1407                  assumed  to  be  a  single-bit  field with a value of 1.  An
1408                  example entry may look like this: event=0x2,inv,ldlat=3
1409
1410           /sys/bus/event_source/devices/*/uevent
1411                  This file  is  the  standard  kernel  device  interface  for
1412                  injecting hotplug events.
1413
1414           /sys/bus/event_source/devices/*/cpumask (Since Linux 3.7)
1415                  The cpumask file contains a comma-separated list of integers
1416                  that indicate a representative cpu number  for  each  socket
1417                  (package)  on  the motherboard.  This is needed when setting
1418                  up uncore or  northbridge  events,  as  those  PMUs  present
1419                  socket-wide events.
1420

RETURN VALUE

1422       perf_event_open()  returns  the  new file descriptor, or -1 if an error
1423       occurred (in which case, errno is set appropriately).
1424

ERRORS

1426       EINVAL Returned if the specified event is not available.
1427
1428       ENOSPC Prior to Linux 3.3, if there was not enough room for the  event,
1429              ENOSPC  was  returned.   Linus  did  not like this, and this was
1430              changed to EINVAL.  ENOSPC is still returned if you try to  read
1431              results into a buffer that is too small.
1432

VERSION

1434       perf_event_open()  was  introduced  in  Linux  2.6.31  but  was  called
1435       perf_counter_open().  It was renamed in Linux 2.6.32.
1436

CONFORMING TO

1438       This perf_event_open() system call is Linux-specific and should not  be
1439       used in programs intended to be portable.
1440

NOTES

1442       Glibc  does  not  provide a wrapper for this system call; call it using
1443       syscall(2).  See the example below.
1444
1445       The official way of knowing if perf_event_open() support is enabled  is
1446       checking    for    the    existence    of   the   file   /proc/sys/ker‐
1447       nel/perf_event_paranoid.
1448

BUGS

1450       The F_SETOWN_EX option to fcntl(2) is needed to properly  get  overflow
1451       signals in threads.  This was introduced in Linux 2.6.32.
1452
1453       Prior  to  Linux  2.6.33 (at least for x86) the kernel did not check if
1454       events could be scheduled together until read time.  The  same  happens
1455       on all known kernels if the NMI watchdog is enabled.  This means to see
1456       if a given set of events works you have  to  perf_event_open(),  start,
1457       then read before you know for sure you can get valid measurements.
1458
1459       Prior  to  Linux 2.6.34 event constraints were not enforced by the ker‐
1460       nel.  In that case, some events would silently return "0" if the kernel
1461       scheduled them in an improper counter slot.
1462
1463       Prior to Linux 2.6.34 there was a bug when multiplexing where the wrong
1464       results could be returned.
1465
1466       Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the  kernel
1467       if "inherit" is enabled and many threads are started.
1468
1469       Prior  to  Linux  2.6.35,  PERF_FORMAT_GROUP did not work with attached
1470       processes.
1471
1472       In older Linux 2.6 versions, refreshing an event group leader refreshed
1473       all  siblings,  and  refreshing  with a parameter of 0 enabled infinite
1474       refresh.  This behavior is unsupported and should not be relied on.
1475
1476       There is a bug in the kernel code between Linux 2.6.36  and  Linux  3.0
1477       that  ignores  the  "watermark" field and acts as if a wakeup_event was
1478       chosen if the union has a non-zero value in it.
1479
1480       From Linux 2.6.31 to Linux 3.4, the PERF_IOC_FLAG_GROUP ioctl  argument
1481       was  broken  and would repeatedly operate on the event specified rather
1482       than iterating across all sibling events in a group.
1483
1484       Always double-check your results!  Various generalized events have  had
1485       wrong  values.   For example, retired branches measured the wrong thing
1486       on AMD machines until Linux 2.6.35.
1487

EXAMPLE

1489       The following is a short example that measures  the  total  instruction
1490       count of a call to printf(3).
1491
1492       #include <stdlib.h>
1493       #include <stdio.h>
1494       #include <unistd.h>
1495       #include <string.h>
1496       #include <sys/ioctl.h>
1497       #include <linux/perf_event.h>
1498       #include <asm/unistd.h>
1499
1500       long
1501       perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
1502                       int cpu, int group_fd, unsigned long flags)
1503       {
1504           int ret;
1505
1506           ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
1507                          group_fd, flags);
1508           return ret;
1509       }
1510
1511       int
1512       main(int argc, char **argv)
1513       {
1514           struct perf_event_attr pe;
1515           long long count;
1516           int fd;
1517
1518           memset(&pe, 0, sizeof(struct perf_event_attr));
1519           pe.type = PERF_TYPE_HARDWARE;
1520           pe.size = sizeof(struct perf_event_attr);
1521           pe.config = PERF_COUNT_HW_INSTRUCTIONS;
1522           pe.disabled = 1;
1523           pe.exclude_kernel = 1;
1524           pe.exclude_hv = 1;
1525
1526           fd = perf_event_open(&pe, 0, -1, -1, 0);
1527           if (fd == -1) {
1528              fprintf(stderr, "Error opening leader %llx\n", pe.config);
1529              exit(EXIT_FAILURE);
1530           }
1531
1532           ioctl(fd, PERF_EVENT_IOC_RESET, 0);
1533           ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
1534
1535           printf("Measuring instruction count for this printf\n");
1536
1537           ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
1538           read(fd, &count, sizeof(long long));
1539
1540           printf("Used %lld instructions\n", count);
1541
1542           close(fd);
1543       }
1544

SEE ALSO

1546       fcntl(2), mmap(2), open(2), prctl(2), read(2)
1547

COLOPHON

1549       This  page  is  part of release 3.53 of the Linux man-pages project.  A
1550       description of the project, and information about reporting  bugs,  can
1551       be found at http://www.kernel.org/doc/man-pages/.
1552
1553
1554
1555Linux                             2013-07-16                PERF_EVENT_OPEN(2)
Impressum