1fi_cq(3)                       Libfabric v1.18.1                      fi_cq(3)
2
3
4

NAME

6       fi_cq - Completion queue operations
7
8       fi_cq_open / fi_close
9              Open/close a completion queue
10
11       fi_control
12              Control CQ operation or attributes.
13
14       fi_cq_read / fi_cq_readfrom / fi_cq_readerr
15              Read a completion from a completion queue
16
17       fi_cq_sread / fi_cq_sreadfrom
18              A  synchronous (blocking) read that waits until a specified con‐
19              dition has been met before reading a completion from  a  comple‐
20              tion queue.
21
22       fi_cq_signal
23              Unblock any thread waiting in fi_cq_sread or fi_cq_sreadfrom.
24
25       fi_cq_strerror
26              Converts  provider  specific  error information into a printable
27              string
28

SYNOPSIS

30              #include <rdma/fi_domain.h>
31
32              int fi_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
33                  struct fid_cq **cq, void *context);
34
35              int fi_close(struct fid *cq);
36
37              int fi_control(struct fid *cq, int command, void *arg);
38
39              ssize_t fi_cq_read(struct fid_cq *cq, void *buf, size_t count);
40
41              ssize_t fi_cq_readfrom(struct fid_cq *cq, void *buf, size_t count,
42                  fi_addr_t *src_addr);
43
44              ssize_t fi_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf,
45                  uint64_t flags);
46
47              ssize_t fi_cq_sread(struct fid_cq *cq, void *buf, size_t count,
48                  const void *cond, int timeout);
49
50              ssize_t fi_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count,
51                  fi_addr_t *src_addr, const void *cond, int timeout);
52
53              int fi_cq_signal(struct fid_cq *cq);
54
55              const char * fi_cq_strerror(struct fid_cq *cq, int prov_errno,
56                    const void *err_data, char *buf, size_t len);
57

ARGUMENTS

59       domain Open resource domain
60
61       cq     Completion queue
62
63       attr   Completion queue attributes
64
65       context
66              User specified context associated with the completion queue.
67
68       buf    For read calls, the data buffer to write completions into.   For
69              write  calls,  a completion to insert into the completion queue.
70              For fi_cq_strerror, an optional buffer that  receives  printable
71              error information.
72
73       count  Number of CQ entries.
74
75       len    Length of data buffer
76
77       src_addr
78              Source address of a completed receive operation
79
80       flags  Additional flags to apply to the operation
81
82       command
83              Command of control operation to perform on CQ.
84
85       arg    Optional control argument
86
87       cond   Condition that must be met before a completion is generated
88
89       timeout
90              Time  in milliseconds to wait.  A negative value indicates infi‐
91              nite timeout.
92
93       prov_errno
94              Provider specific error value
95
96       err_data
97              Provider specific error data related to a completion
98

DESCRIPTION

100       Completion queues are used to report events associated with data trans‐
101       fers.   They are associated with message sends and receives, RMA, atom‐
102       ic, tagged messages, and triggered events.  Reported events are usually
103       associated with a fabric endpoint, but may also refer to memory regions
104       used as the target of an RMA or atomic operation.
105
106   fi_cq_open
107       fi_cq_open allocates a new completion queue.  Unlike event queues, com‐
108       pletion  queues  are  associated with a resource domain and may be off‐
109       loaded entirely in provider hardware.
110
111       The properties and behavior of a completion queue are defined by struct
112       fi_cq_attr.
113
114              struct fi_cq_attr {
115                  size_t               size;      /* # entries for CQ */
116                  uint64_t             flags;     /* operation flags */
117                  enum fi_cq_format    format;    /* completion format */
118                  enum fi_wait_obj     wait_obj;  /* requested wait object */
119                  int                  signaling_vector; /* interrupt affinity */
120                  enum fi_cq_wait_cond wait_cond; /* wait condition format */
121                  struct fid_wait     *wait_set;  /* optional wait set */
122              };
123
124       size   Specifies  the minimum size of a completion queue.  A value of 0
125              indicates that the provider may choose a default value.
126
127       flags  Flags that control the configuration of the CQ.
128
129       - FI_AFFINITY
130              Indicates that the signaling_vector field (see below) is valid.
131
132       format Completion queues allow the application to select the amount  of
133              detail  that it must store and report.  The format attribute al‐
134              lows the application to select one of  several  completion  for‐
135              mats,  indicating  the structure of the data that the completion
136              queue should return when read.  Supported formats and the struc‐
137              tures that correspond to each are listed below.  The meanings of
138              the CQ entry fields are defined in the  Completion  Fields  sec‐
139              tion.
140
141       - FI_CQ_FORMAT_UNSPEC
142              If  an  unspecified  format is requested, then the CQ will use a
143              provider selected default format.
144
145       - FI_CQ_FORMAT_CONTEXT
146              Provides only user specified context that  was  associated  with
147              the completion.
148
149              struct fi_cq_entry {
150                  void     *op_context; /* operation context */
151              };
153
154       - FI_CQ_FORMAT_MSG
155              Provides  minimal data for processing completions, with expanded
156              support for reporting information about received messages.
157
158              struct fi_cq_msg_entry {
159                  void     *op_context; /* operation context */
160                  uint64_t flags;       /* completion flags */
161                  size_t   len;         /* size of received data */
162              };
164
165       - FI_CQ_FORMAT_DATA
166              Provides data associated with a  completion.   Includes  support
167              for  received  message length, remote CQ data, and multi-receive
168              buffers.
169
170              struct fi_cq_data_entry {
171                  void     *op_context; /* operation context */
172                  uint64_t flags;       /* completion flags */
173                  size_t   len;         /* size of received data */
174                  void     *buf;        /* receive data buffer */
175                  uint64_t data;        /* completion data */
176              };
178
179       - FI_CQ_FORMAT_TAGGED
180              Expands completion data to include support for the  tagged  mes‐
181              sage interfaces.
182
183              struct fi_cq_tagged_entry {
184                  void     *op_context; /* operation context */
185                  uint64_t flags;       /* completion flags */
186                  size_t   len;         /* size of received data */
187                  void     *buf;        /* receive data buffer */
188                  uint64_t data;        /* completion data */
189                  uint64_t tag;         /* received tag */
190              };
191
192       wait_obj
193              CQs may be associated with a specific wait  object.   Wait  ob‐
194              jects allow applications to block until the wait object is  sig‐
195              naled,  indicating  that  a  completion is available to be read.
196              Users may use fi_control to retrieve the underlying wait  object
197              associated  with a CQ, in order to use it in other system calls.
198              The following values may be used to specify the type of wait ob‐
199              ject   associated   with  a  CQ:  FI_WAIT_NONE,  FI_WAIT_UNSPEC,
200              FI_WAIT_SET, FI_WAIT_FD, FI_WAIT_MUTEX_COND, and  FI_WAIT_YIELD.
201              The default is FI_WAIT_NONE.
202
203       - FI_WAIT_NONE
204              Used to indicate that the user will not block (wait) for comple‐
205              tions on the CQ.  When FI_WAIT_NONE is specified,  the  applica‐
206              tion may not call fi_cq_sread or fi_cq_sreadfrom.
207
208       - FI_WAIT_UNSPEC
209              Specifies  that  the  user will only wait on the CQ using fabric
210              interface calls, such as  fi_cq_sread  or  fi_cq_sreadfrom.   In
211              this case, the underlying provider may select the most appropri‐
212              ate or highest performing wait object available, including  cus‐
213              tom  wait  mechanisms.   Applications that select FI_WAIT_UNSPEC
214              are not guaranteed to retrieve the underlying wait object.
215
216       - FI_WAIT_SET
217              Indicates that the completion queue should use a wait set object
218              to  wait for completions.  If specified, the wait_set field must
219              reference an existing wait set object.
220
221       - FI_WAIT_FD
222              Indicates that the CQ should use a file descriptor as  its  wait
223              mechanism.   A file descriptor wait object must be usable in se‐
224              lect, poll, and epoll routines.  However, a provider may  signal
225              an  FD  wait object by marking it as readable, writable, or with
226              an error.
227
228       - FI_WAIT_MUTEX_COND
229              Specifies that the CQ should use a pthread mutex and cond  vari‐
230              able as a wait object.
231
232       - FI_WAIT_YIELD
233              Indicates  that  the  CQ will wait without a wait object but in‐
234              stead yield on every wait.   Allows  usage  of  fi_cq_sread  and
235              fi_cq_sreadfrom through a spin.
236
237       signaling_vector
238              If  the  FI_AFFINITY flag is set, this indicates the logical cpu
239              number (0..max cpu - 1) that interrupts associated with  the  CQ
240              should  target.   This  field should be treated as a hint to the
241              provider and may be ignored if the provider does not support in‐
242              terrupt affinity.
243
244       wait_cond
245              By  default,  when  a completion is inserted into a CQ that sup‐
246              ports blocking reads (fi_cq_sread/fi_cq_sreadfrom),  the  corre‐
247              sponding wait object is signaled.  Users may specify a condition
248              that must first be met before the wait is satisfied.  This field
249              indicates  how  the  provider  should  interpret the cond field,
250              which describes the condition needed to signal the wait object.
251
252       A wait condition should be treated as an optimization.   Providers  are
253       not required to meet the requirements of the condition before signaling
254       the wait object.  Applications should not rely on the condition  neces‐
255       sarily being true when a blocking read call returns.
256
257       If  wait_cond  is set to FI_CQ_COND_NONE, then no additional conditions
258       are applied to the signaling of the CQ wait object, and  the  insertion
259       of  any new entry will trigger the wait condition.  If wait_cond is set
260       to FI_CQ_COND_THRESHOLD, then the cond field is interpreted as a size_t
261       threshold  value.   The  threshold indicates the number of entries that
262       are to be queued at the CQ before the wait is satisfied.
263
264       This field is ignored if wait_obj is set to FI_WAIT_NONE.
265
266       wait_set
267              If wait_obj is FI_WAIT_SET, this field references a wait  object
268              to  which  the completion queue should attach.  When an event is
269              inserted into the completion queue, the corresponding  wait  set
270              will  be  signaled if all necessary conditions are met.  The use
271              of a wait_set enables an optimized method of waiting for  events
272              across  multiple event and completion queues.  This field is ig‐
273              nored if wait_obj is not FI_WAIT_SET.
274
275   fi_close
276       The fi_close call releases all resources associated with  a  completion
277       queue.   Any  completions  which remain on the CQ when it is closed are
278       lost.
279
280       When closing the CQ, there must be no opened endpoints,  transmit  con‐
281       texts,  or  receive  contexts associated with the CQ.  If resources are
282       still associated with the CQ when attempting to close,  the  call  will
283       return -FI_EBUSY.
284
285   fi_control
286       The  fi_control  call is used to access provider or implementation spe‐
287       cific details of the completion queue.  Access to the CQ should be  se‐
288       rialized  across  all calls when fi_control is invoked, as it may redi‐
289       rect the implementation of CQ operations.  The following  control  com‐
290       mands are usable with a CQ.
291
292       FI_GETWAIT (void **)
293              This  command allows the user to retrieve the low-level wait ob‐
294              ject associated with the CQ.  The format of the  wait-object  is
295              specified  during  CQ  creation, through the CQ attributes.  The
296              fi_control arg parameter should be an address where a pointer to
297              the returned wait object will be written.  See fi_eq.3 for addi‐
298              tional details on using fi_control with FI_GETWAIT.
299
300   fi_cq_read
301       The fi_cq_read operation performs a non-blocking read of completion da‐
302       ta from the CQ.  The format of the completion event is determined using
303       the fi_cq_format option that was specified  when  the  CQ  was  opened.
304       Multiple  completions may be retrieved from a CQ in a single call.  The
305       maximum number of entries to return is limited to the  specified  count
306       parameter, with the number of entries successfully read from the CQ re‐
307       turned by the call.  (See return values section below.) A  count  value
308       of  0 may be used to drive progress on associated endpoints when manual
309       progress is enabled.
310
311       CQs are optimized to report operations which have completed successful‐
312       ly.  Operations which fail are reported `out of band'.  Such operations
313       are retrieved using the fi_cq_readerr function.  When an operation that
314       has completed with an unexpected error is encountered, it is placed in‐
315       to a temporary error queue.  Attempting to read from a CQ while an item
316       is  in the error queue results in fi_cq_read failing with a return code
317       of -FI_EAVAIL.  Applications may use this return code to determine when
318       to call fi_cq_readerr.
319
320   fi_cq_readfrom
321       The fi_cq_readfrom call behaves identically to fi_cq_read, with the ex‐
322       ception that it allows the CQ to return source address  information  to
323       the  user for any received data.  Source address data is only available
324       for  those  endpoints  configured  with   FI_SOURCE   capability.    If
325       fi_cq_readfrom is called on an endpoint for which source addressing da‐
326       ta is not available, the source address  will  be  set  to  FI_ADDR_NO‐
327       TAVAIL.   The  number of input src_addr entries must be the same as the
328       count parameter.
329
330       Returned source addressing data is converted from  the  native  address
331       used  by  the underlying fabric into an fi_addr_t, which may be used in
332       transmit operations.  Under most circumstances, returning fi_addr_t re‐
333       quires  that the source address already have been inserted into the ad‐
334       dress vector associated with the receiving endpoint.  This is true  for
335       address   vectors  of  type  FI_AV_TABLE.   In  select  providers  when
336       FI_AV_MAP is used, source addresses may  be  converted  algorithmically
337       into  a  usable  fi_addr_t, even though the source address has not been
338       inserted into the address vector.  This is permitted by the API, as  it
339       allows the provider to avoid address look-up as part of receive message
340       processing.  In no case do providers insert addresses into an AV  sepa‐
341       rate from an application calling fi_av_insert or similar call.
342
343       For  endpoints  allocated  using  the  FI_SOURCE_ERR capability, if the
344       source address cannot  be  converted  into  a  valid  fi_addr_t  value,
345       fi_cq_readfrom  will  return -FI_EAVAIL, even if the data were received
346       successfully.  The completion will then be reported through fi_cq_read‐
347       err with error code -FI_EADDRNOTAVAIL.  See fi_cq_readerr for details.
348
349       If FI_SOURCE is specified without FI_SOURCE_ERR, source addresses which
350       cannot be mapped to a usable fi_addr_t will be reported as  FI_ADDR_NO‐
351       TAVAIL.
352
353   fi_cq_sread / fi_cq_sreadfrom
354       The  fi_cq_sread  and fi_cq_sreadfrom calls are the blocking equivalent
355       operations to fi_cq_read and fi_cq_readfrom.  Their behavior is similar
356       to  the  non-blocking calls, with the exception that the calls will not
357       return until either a completion has been read from the CQ or an  error
358       or timeout occurs.
359
360       Threads blocking in this function will return to the caller if they are
361       signaled by some external source.  This is true even if the timeout has
362       not occurred or was specified as infinite.
363
364       It  is  invalid  for applications to call these functions if the CQ has
365       been configured with a wait object of FI_WAIT_NONE or FI_WAIT_SET.
366
367   fi_cq_readerr
368       The read error function, fi_cq_readerr, retrieves information regarding
369       any  asynchronous  operation which has completed with an unexpected er‐
370       ror.  fi_cq_readerr  is  a  non-blocking  call,  returning  immediately
371       whether an error completion was found or not.
372
373       Error  information is reported to the user through struct fi_cq_err_en‐
374       try.  The format of this structure is defined below.
375
376              struct fi_cq_err_entry {
377                  void     *op_context; /* operation context */
378                  uint64_t flags;       /* completion flags */
379                  size_t   len;         /* size of received data */
380                  void     *buf;        /* receive data buffer */
381                  uint64_t data;        /* completion data */
382                  uint64_t tag;         /* message tag */
383                  size_t   olen;        /* overflow length */
384                  int      err;         /* positive error code */
385                  int      prov_errno;  /* provider error code */
386                  void    *err_data;    /*  error data */
387                  size_t   err_data_size; /* size of err_data */
388              };
389
390       The general reason for the error is provided  through  the  err  field.
391       Provider  specific  error information may also be available through the
392       prov_errno and err_data fields.  Users may call fi_cq_strerror to  con‐
393       vert  provider  specific  error information into a printable string for
394       debugging purposes.  See field details below for  more  information  on
395       the use of err_data and err_data_size.
396
397       Note that error completions are generated for all operations, including
398       those for which a completion was not  requested  (e.g. an  endpoint  is
399       configured  with  FI_SELECTIVE_COMPLETION, but the request did not have
400       the FI_COMPLETION flag set).  In such cases, providers will  return  as
401       much information as made available by the underlying software and hard‐
402       ware about the failure; other fields will be set to NULL  or  0.   This
403       includes  the op_context value, which may not have been provided or was
404       ignored on input as part of the transfer.
405
406       Notable completion error codes are given below.
407
408       FI_EADDRNOTAVAIL
409              This error code is used by CQs configured with FI_SOURCE_ERR  to
410              report  completions  for which a usable fi_addr_t source address
411              could not be found.  An error code of FI_EADDRNOTAVAIL indicates
412              that  the data transfer was successfully received and processed,
413              with the fi_cq_err_entry fields containing information about the
414              completion.   The  err_data  field will be set to the source ad‐
415              dress data.  The source address will be in the  same  format  as
416              specified  through  the fi_info addr_format field for the opened
417              domain.  This may be passed directly into an  fi_av_insert  call
418              to add the source address to the address vector.
419
420   fi_cq_signal
421       The fi_cq_signal call will unblock any thread waiting in fi_cq_sread or
422       fi_cq_sreadfrom.  This may be used to wake up a thread that is  blocked
423       waiting  to read a completion operation.  The fi_cq_signal operation is
424       only available if the CQ was configured with a wait object.
425

COMPLETION FIELDS

427       The CQ entry data structures share many of the same fields.  The  mean‐
428       ings of these fields are the same for all CQ entry structure formats.
429
430       op_context
431              The operation context is the application specified context value
432              that was provided with an asynchronous operation.   The  op_con‐
433              text field is valid for all completions that are associated with
434              an asynchronous operation.
435
436       For completion events that are not associated with a posted  operation,
437       this field will be set to NULL.  This includes completions generated at
438       the target in response to RMA  write  operations  that  carry  CQ  data
439       (FI_REMOTE_WRITE | FI_REMOTE_CQ_DATA flags set), when the FI_RX_CQ_DATA
440       mode bit is not required.
441
442       flags  This specifies flags associated with  the  completed  operation.
443              The  Completion  Flags  section  below  lists valid flag values.
444              Flags are set for all relevant completions.
445
446       len    This  len  field  applies  to   completed   receive   operations
447              (e.g. fi_recv,  fi_trecv, etc.) and the completed write with re‐
448              mote cq data on the responder side (e.g. fi_write,  with  FI_RE‐
449              MOTE_CQ_DATA  flag).   It indicates the size of transferred mes‐
450              sage data – i.e. how many data bytes were placed into the  asso‐
451              ciated     receive/target     buffer    by    a    corresponding
452              fi_send/fi_tsend/fi_write et al call.  If an endpoint  has  been
453              configured  with  the  FI_MSG_PREFIX mode, the len also reflects
454              the size of the prefix buffer.
455
456       buf    The buf field is only valid for  completed  receive  operations,
457              and  only  applies  when  the receive buffer was posted with the
458              FI_MULTI_RECV flag.  In this case, buf points  to  the  starting
459              location where the receive data was placed.
460
461       data   The data field is only valid if the FI_REMOTE_CQ_DATA completion
462              flag is set, and only applies to receive completions.  If FI_RE‐
463              MOTE_CQ_DATA is set, this field will contain the completion data
464              provided by the peer as part of  their  transmit  request.   The
465              completion data will be given in host byte order.
466
467       tag    A  tag  applies  only  to received messages that occur using the
468              tagged interfaces.  This field contains the tag that was includ‐
469              ed  with the received message.  The tag will be in host byte or‐
470              der.
471
472       olen   The olen field applies to received messages.  It is used to  in‐
473              dicate  that a received message has overrun the available buffer
474              space and has been truncated.  The olen specifies the amount  of
475              data  that did not fit into the available receive buffer and was
476              discarded.
477
478       err    This err code is a positive fabric errno associated with a  com‐
479              pletion.   The err value indicates the general reason for an er‐
480              ror, if one occurred.  See fi_errno.3 for a list of possible er‐
481              ror codes.
482
483       prov_errno
484              On  an  error,  prov_errno may contain a provider specific error
485              code.  The use of this field and its meaning are provider specif‐
486              ic.   It  is  intended  to  be  used  as  a  debugging aid.  See
487              fi_cq_strerror for additional details on converting  this  error
488              value into a human readable string.
489
490       err_data
491              The  err_data field is used to return provider specific informa‐
492              tion, if available, about the error.  On input, err_data  should
493              reference  a  data buffer of size err_data_size.  On output, the
494              provider will fill in this buffer with any provider specific da‐
495              ta which may help identify the cause of the error.  The contents
496              of the err_data field and its meaning are provider specific.  It
497              is  intended  to be used as a debugging aid.  See fi_cq_strerror
498              for additional details on converting this error data into a  hu‐
499              man  readable  string.   See the compatibility note below on how
500              this field is used for older libfabric releases.
501
502       err_data_size
503              On input, err_data_size indicates the size of the err_data  buf‐
504              fer  in bytes.  On output, err_data_size will be set to the num‐
505              ber of bytes copied to the err_data buffer.  The err_data infor‐
506              mation  is typically used with fi_cq_strerror to provide details
507              about the type of error that occurred.
508
509       For compatibility purposes, the behavior of the  err_data  and  err_da‐
510       ta_size  fields  may  be modified from that listed above.  If err_da‐
511       ta_size is 0 on input, or the fabric was opened  with  release  <  1.5,
512       then  any  buffer  referenced by err_data will be ignored on input.  In
513       this situation, on output err_data will be set to a data  buffer  owned
514       by  the provider.  The contents of the buffer will remain valid until a
515       subsequent read call against the CQ.  Applications must  serialize  ac‐
516       cess  to the CQ when processing errors to ensure that the buffer refer‐
517       enced by err_data does not change.
518

COMPLETION FLAGS

520       Completion flags provide additional details regarding the completed op‐
521       eration.  The following completion flags are defined.
522
523       FI_SEND
524              Indicates  that  the  completion was for a send operation.  This
525              flag may be combined with an FI_MSG or FI_TAGGED flag.
526
527       FI_RECV
528              Indicates that the completion was for a receive operation.  This
529              flag may be combined with an FI_MSG or FI_TAGGED flag.
530
531       FI_RMA Indicates  that  an  RMA  operation completed.  This flag may be
532              combined with an FI_READ, FI_WRITE,  FI_REMOTE_READ,  or  FI_RE‐
533              MOTE_WRITE flag.
534
535       FI_ATOMIC
536              Indicates  that an atomic operation completed.  This flag may be
537              combined with an FI_READ, FI_WRITE,  FI_REMOTE_READ,  or  FI_RE‐
538              MOTE_WRITE flag.
539
540       FI_MSG Indicates  that  a message-based operation completed.  This flag
541              may be combined with an FI_SEND or FI_RECV flag.
542
543       FI_TAGGED
544              Indicates that a tagged message operation completed.  This  flag
545              may be combined with an FI_SEND or FI_RECV flag.
546
547       FI_MULTICAST
548              Indicates  that  a multicast operation completed.  This flag may
549              be combined with FI_MSG and relevant flags.  This flag  is  only
550              guaranteed to be valid for received messages if the endpoint has
551              been configured with FI_SOURCE.
552
553       FI_READ
554              Indicates that a locally initiated RMA or atomic read  operation
555              has  completed.   This  flag  may  be combined with an FI_RMA or
556              FI_ATOMIC flag.
557
558       FI_WRITE
559              Indicates that a locally initiated RMA or atomic write operation
560              has  completed.   This  flag  may  be combined with an FI_RMA or
561              FI_ATOMIC flag.
562
563       FI_REMOTE_READ
564              Indicates that a remotely initiated RMA or atomic read operation
565              has  completed.   This  flag  may  be combined with an FI_RMA or
566              FI_ATOMIC flag.
567
568       FI_REMOTE_WRITE
569              Indicates that a remotely initiated RMA or atomic  write  opera‐
570              tion has completed.  This flag may be combined with an FI_RMA or
571              FI_ATOMIC flag.
572
573       FI_REMOTE_CQ_DATA
574              This indicates that remote CQ data is available as part  of  the
575              completion.
576
577       FI_MULTI_RECV
578              This  flag  applies to receive buffers that were posted with the
579              FI_MULTI_RECV flag set.  This completion flag indicates that the
580              original  receive  buffer  referenced by the completion has been
581              consumed and was released by the provider.   Providers  may  set
582              this  flag  on the last message that is received into the multi-
583              recv buffer, or may generate a separate  completion  that  indi‐
584              cates that the buffer has been released.
585
586       Applications  can  distinguish between these two cases by examining the
587       completion entry flags field.  If additional flags,  such  as  FI_RECV,
588       are set, the completion is associated with a received message.  In this
589       case, the buf field will reference the location where the received mes‐
590       sage  was  placed into the multi-recv buffer.  Other fields in the com‐
591       pletion entry will be determined based on  the  received  message.   If
592       other flag bits are zero, the provider is reporting that the multi-recv
593       buffer has been released, and the completion entry  is  not  associated
594       with a received message.
595
596       FI_MORE
597              See  the  `Buffered  Receives' section in fi_msg(3) for more de‐
598              tails.  This flag is associated with receive completions on end‐
599              points  that  have  FI_BUFFERED_RECV  mode enabled.  When set to
600              one, it indicates that the buffer referenced by  the  completion
601              is limited by the FI_OPT_BUFFERED_LIMIT threshold, and addition‐
602              al message data must be retrieved by the  application  using  an
603              FI_CLAIM operation.
604
605       FI_CLAIM
606              See  the  `Buffered  Receives' section in fi_msg(3) for more de‐
607              tails.  This flag is set on completions associated with  receive
608              operations  that  claim  buffered  receive data.  Note that this
609              flag   only   applies   to   endpoints   configured   with   the
610              FI_BUFFERED_RECV mode bit.
611

COMPLETION EVENT SEMANTICS

613       Libfabric  defines several completion `levels', identified using opera‐
614       tional flags.  Each flag indicates the soonest that a completion  event
615       may be generated by a provider, and the assumptions that an application
616       may make upon processing a completion.  The operational flags  are  de‐
617       fined  below,  along  with an example of how a provider might implement
618       the semantic.  Note that only meeting the semantic is required  of  the
619       provider  and not the implementation.  Providers may implement stronger
620       completion semantics than necessary for a given operation, but only the
621       behavior defined by the completion level is guaranteed.
622
623       To  help  understand  the  conceptual differences in completion levels,
624       consider mailing a letter.  Placing the letter into the  local  mailbox
625       for  pick-up is similar to `inject complete'.  Having the letter picked
626       up and dropped off at the destination mailbox is equivalent to  `trans‐
627       mit  complete'.  The `delivery complete' semantic is a stronger guaran‐
628       tee, with a person at the destination signing for the letter.  However,
629       the  person  who  signed for the letter is not necessarily the intended
630       recipient.  The `match complete' option is  similar  to  delivery  com‐
631       plete, but requires the intended recipient to sign for the letter.
632
633       The `commit complete' level has different semantics than the previously
634       mentioned levels.  Commit complete would be closer to the letter arriv‐
635       ing at the destination and being placed into a fireproof safe.
636
637       The  operational  flags for the described completion levels are defined
638       below.
639
640       FI_INJECT_COMPLETE
641              Indicates that a completion should be generated when the  source
642              buffer(s)  may be reused.  A completion guarantees that the buf‐
643              fers will not be read from again and the application may reclaim
644              them.  No other guarantees are made with respect to the state of
645              the operation.
646
647       Example: A provider may generate this completion  event  after  copying
648       the  source  buffer  into a network buffer, either in host memory or on
649       the NIC.  An inject completion does not indicate that the data has been
650       transmitted  onto  the network, and a local error could occur after the
651       completion event has been generated that could prevent  it  from  being
652       transmitted.
653
654       Inject  complete  allows  for  the  fastest  completion reporting (and,
655       hence, buffer reuse), but provides the weakest guarantees against  net‐
656       work errors.
657
658       Note:  This flag is used to control when a completion entry is inserted
659       into a completion queue.  It does not apply to operations that  do  not
660       generate a completion queue entry, such as the fi_inject operation, and
661       is not subject to the inject_size message limit restriction.
662
663       FI_TRANSMIT_COMPLETE
664              Indicates that a completion should be generated when the  trans‐
665              mit operation has completed relative to the local provider.  The
666              exact behavior is dependent on the endpoint type.
667
668       For reliable endpoints:
669
670       Indicates that a completion should be generated when the operation  has
671       been  delivered to the peer endpoint.  A completion guarantees that the
672       operation is no longer dependent on the fabric or local resources.  The
673       state of the operation at the peer endpoint is not defined.
674
675       Example: A provider may generate a transmit complete event upon receiv‐
676       ing an ack from the peer endpoint.  The state of  the  message  at  the
677       peer  is  unknown and may be buffered in the target NIC at the time the
678       ack has been generated.
679
680       For unreliable endpoints:
681
682       Indicates that a completion should be generated when the operation  has
683       been  delivered to the fabric.  A completion guarantees that the opera‐
684       tion is no longer dependent on local resources.  The state of the oper‐
685       ation within the fabric is not defined.
686
687       FI_DELIVERY_COMPLETE
688              Indicates that a completion should not be generated until an op‐
689              eration has been processed by the  destination  endpoint(s).   A
690              completion guarantees that the result of the operation is avail‐
691              able; however, additional steps may need to be taken at the des‐
692              tination  to  retrieve the results.  For example, an application
693              may need to provide receive buffers in order to retrieve  mes‐
694              sages that were buffered by the provider.
695
696       Delivery  complete indicates that the message has been processed by the
697       peer.  If an application buffer was ready to receive the results of the
698       message when it arrived, then delivery complete indicates that the data
699       was placed into the application’s buffer.
700
701       This completion mode applies only to reliable  endpoints.   For  opera‐
702       tions  that  return  data to the initiator, such as RMA read or atomic-
703       fetch, the source endpoint is also considered a  destination  endpoint.
704       This is the default completion mode for such operations.
705
706       FI_MATCH_COMPLETE
707              Indicates  that  a completion should be generated only after the
708              operation has been matched with an application specified buffer.
709              Operations  using  this completion semantic are dependent on the
710              application at the target claiming the message or results.  As a
711              result, match complete may involve additional provider level ac‐
712              knowledgements or lengthy delays.  However, this completion mod‐
713              el  enables  peer  applications  to synchronize their execution.
714              Many providers may not support this semantic.
715
716       FI_COMMIT_COMPLETE
717              Indicates that a completion should not be generated (locally  or
718              at  the  peer)  until  the result of an operation has been made
719              persistent.  A completion guarantees that  the  result  is  both
720              available and durable, in the case of power failure.
721
722       This  completion mode applies only to operations that target persistent
723       memory regions over reliable endpoints.  This completion mode is exper‐
724       imental.
725
726       FI_FENCE
727              This  is not a completion level, but plays a role in the comple‐
728              tion ordering between operations that would not normally be  or‐
729              dered.   An  operation that is marked with the FI_FENCE flag and
730              all operations posted after the fenced  operation  are  deferred
731              until  all  previous operations targeting the same peer endpoint
732              have completed.  Additionally, the completion of the fenced  op‐
733              eration  indicates  that prior operations have met the same com‐
734              pletion level as the fenced operation.  For example, if an oper‐
735              ation  is  posted  as  FI_DELIVERY_COMPLETE | FI_FENCE, then its
736              completion indicates prior operations have met the semantic  re‐
737              quired for FI_DELIVERY_COMPLETE.  This is true even if the prior
738              operation was posted with a  lower  completion  level,  such  as
739              FI_TRANSMIT_COMPLETE or FI_INJECT_COMPLETE.
740
741       Note  that  a completion generated for an operation posted prior to the
742       fenced operation only guarantees that the  completion  level  that  was
743       originally  requested has been met.  It is the completion of the fenced
744       operation that guarantees that the additional semantics have been met.
745
746       The above completion semantics are defined with respect to the  initia‐
747       tor  of the operation.  The different semantics are useful for describ‐
748       ing when the initiator may re-use a data buffer, and they guarantee what
749       state  a  transfer  must  reach  prior to a completion being generated.
750       This allows applications to determine  appropriate  error  handling  in
751       case of communication failures.
752

TARGET COMPLETION SEMANTICS

754       The completion semantic at the target is used to determine when data at
755       the target is visible to the peer  application.   Visibility  indicates
756       that  a  memory  read to the same address that was the target of a data
757       transfer will return the results of the  transfer.   The  target  of  a
758       transfer can be identified by the initiator, as may be the case for RMA
759       and atomic operations, or determined by the target, for example by pro‐
760       viding a matching receive buffer.  Global visibility indicates that the
761       results are available regardless of where the memory  read  originates.
762       For  example, the read could come from a process running on a host CPU,
763       it may be accessed by a subsequent data transfer over the fabric, or read
764       from a peer device such as a GPU.
765
766       In terms of completion semantics, visibility usually indicates that the
767       transfer meets the FI_DELIVERY_COMPLETE requirements from the  perspec‐
768       tive  of the target.  The target completion semantic may be, but is not
769       necessarily, linked with the completion semantic specified by the  ini‐
770       tiator of the transfer.
771
772       Often,  target  processes  do not explicitly state a desired completion
773       semantic and instead rely on the default semantic.  The default  behav‐
774       ior is based on several factors, including:
775
776       • whether a completion event is generated at the target
777
778       • the type of transfer involved (e.g. msg vs RMA)
779
780       • endpoint data and message ordering guarantees
781
782       • properties of the targeted memory buffer
783
784       • the initiator’s specified completion semantic
785
786       Broadly,  target  completion  semantics are grouped based on whether or
787       not the transfer generates a completion event at the target.  This  in‐
788       cludes  writing a CQ entry or updating a completion counter.  In common
789       use cases, transfers that use a message interface (FI_MSG or FI_TAGGED)
790       typically  generate target events, while transfers involving an RMA in‐
791       terface (FI_RMA or FI_ATOMIC) often do not.  There  are  exceptions  to
792       both  these cases, depending on endpoint to CQ and counter bindings and
793       operational flags.  For example, RMA writes that carry remote  CQ  data
794       will generate a completion event at the target, and are frequently used
795       to convey visibility to the target application.  The general guidelines
796       for  target  side semantics are described below, followed by exceptions
797       that modify that behavior.
798
799       By default, completions generated  at  the  target  indicate  that  the
800       transferred  data  is  immediately available to be read from the target
801       buffer.  That is, the target sees FI_DELIVERY_COMPLETE (or better)  se‐
802       mantics, even if the initiator requested lower semantics.  For applica‐
803       tions using only data buffers allocated from host memory, this is often
804       sufficient.
805
806       For  operations  that do not generate a completion event at the target,
807       the visibility of the data at the target may need to be inferred  based
808       on subsequent operations that do generate target completions.  Absent a
809       target completion, when a completion of an operation is written at  the
810       initiator,  the  visibility  semantic  of  the  operation at the target
811       aligns with the initiator completion semantic.  For instance, if an RMA
812       operation  completes  at  the initiator as either FI_INJECT_COMPLETE or
813       FI_TRANSMIT_COMPLETE, the data visibility at the target is not  guaran‐
814       teed.
815
816       One  or  more  of  the  following  mechanisms can be used by the target
817       process to guarantee that the results of a data transfer that  did  not
818       generate  a completion at the target are now visible.  This list is not
819       inclusive of all options, but defines common uses.  In the descriptions
820       below,  the first transfer does not result in a completion event at the
821       target, but is eventually followed by a transfer which does.
822
823       • If the endpoint guarantees message ordering  between  two  transfers,
824         the target completion of a second transfer will indicate that the da‐
825         ta from the first transfer is available.  For example,  if  the  end‐
826         point  supports  send after write ordering (FI_ORDER_SAW), then a re‐
827         ceive completion corresponding to the send  will  indicate  that  the
828         write  data  is available.  This holds independent of the initiator’s
829         completion semantic for either the write or send.  When  ordering  is
830         guaranteed, the second transfer can be queued with the provider imme‐
831         diately after queuing the first.
832
833       • If the endpoint does not guarantee message  ordering,  the  initiator
834         must take additional steps to ensure visibility.  If the initiator re‐
835         quests FI_DELIVERY_COMPLETE semantics for the  first  operation,  the
836         initiator  can  wait for the operation to complete locally.  Once the
837         completion has been read, the target completion of a second  transfer
838         will indicate that the first transfer’s data is visible.
839
840       • Alternatively, if message ordering is not guaranteed by the endpoint,
841         the initiator can use the FI_FENCE and FI_DELIVERY_COMPLETE flags  on
842         the  second  data  transfer  to force the first transfer to meet the
843         FI_DELIVERY_COMPLETE semantics.  If the second transfer  generates  a
844         completion  at  the target, that will indicate that the data is visi‐
845         ble.  Otherwise, a target  completion  for  any  transfer  after  the
846         fenced operation will indicate that the data is visible.
847
848       The above semantics apply for transfers targeting traditional host mem‐
849       ory buffers.  However, the  behavior  may  differ  when  device  memory
850       and/or  persistent  memory  is involved (FI_HMEM and FI_PMEM capability
851       bits).  When heterogenous memory is involved, the concept of memory do‐
852       mains comes into play.  Memory domains identify the physical separation
853       of memory, which may or may not be accessible through the same  virtual
854       address space.  See the fi_mr(3) man page for further details on memory
855       domains.
856
857       Completion ordering and  data  visibility  are  only  well-defined  for
858       transfers  that target the same memory domain.  Applications need to be
859       aware of ordering and visibility differences when transfers target dif‐
860       ferent memory domains.  Additionally, applications also need to be con‐
861       cerned with the memory domain that completions themselves are written
862       to and whether it differs from the memory domain targeted by a transfer.  In
863       some situations, either the provider or application may  need  to  call
864       device  specific  APIs  to synchronize or flush device memory caches in
865       order to achieve the desired data visibility.
866
867       When heterogenous memory is in use, the default target  completion  se‐
868       mantic  for transfers that generate a completion at the target is still
869       FI_DELIVERY_COMPLETE; however, applications should be aware that  there
870       may  be  a negative impact on overall performance for providers to meet
871       this requirement.
872
873       For example, a target process may be using a GPU to accelerate computa‐
874       tions.   A memory region mapping to memory on the GPU may be exposed to
875       peers as either an RMA target or posted locally as  a  receive  buffer.
876       In  this  case,  the application is concerned with two memory domains –
877       system and GPU memory.  Completions are written to system memory.
878
879       Continuing the example, a peer process sends a  tagged  message.   That
880       message  is matched with the receive buffer located in GPU memory.  The
881       NIC copies the data from the network into the receive buffer and writes
882       an entry into the completion queue.  Note that both memory domains were
883       accessed as part of this transfer.  The message data  was  directed  to
884       the  GPU memory, but the completion went to host memory.  Because sepa‐
885       rate memory domains may not be synchronized with each other, it is pos‐
886       sible  for  the host CPU to see and process the completion entry before
887       the transfer to the GPU memory is visible to either  the  host  CPU  or
888       even  software  running  on  the  GPU.   From  the  perspective  of the
889       provider, visibility of the completion does not imply visibility of da‐
890       ta written to the GPU’s memory domain.
891
892       The  default  completion semantic at the target application for message
893       operations is FI_DELIVERY_COMPLETE.  An anticipated provider  implemen‐
894       tation  in  this  situation is for the provider software running on the
895       host CPU to intercept the CQ entry, detect that the data landed in het‐
896       erogenous  memory,  and perform the necessary device synchronization or
897       flush operation before reporting the completion up to the  application.
898       This  ensures that the data is visible to CPU and GPU software prior to
899       the application processing the completion.
900
901       In addition to the cost of provider software  intercepting  completions
902       and  checking  if  a transfer targeted heterogenous memory, device syn‐
903       chronization itself may impact performance.  As a result,  applications
904       can  request  a  lower completion semantic when posting receives.  That
905       indicates to the provider that the application will be responsible  for
906       handling  any  device  specific  flush operations that might be needed.
907       See fi_msg(3) FLAGS.
908
909       For data transfers that do not generate a  completion  at  the  target,
910       such  as RMA or atomics, it is the responsibility of the application to
911       ensure that all target buffers meet the necessary  visibility  require‐
912       ments  of  the  application.  The previously mentioned bulleted methods
913       for notifying the target that the data is visible  may  not  be  suffi‐
914       cient,  as  the  provider software at the target could lack the context
915       needed to ensure visibility.  This implies  that  the  application  may
916       need to call device synchronization/flush APIs directly.
917
918       For  example,  a peer application could perform several RMA writes that
919       target GPU memory buffers.  If the provider offloads RMA operations in‐
920       to  the  NIC,  the provider software at the target will be unaware that
921       the RMA operations have occurred.  If the peer sends a message  to  the
922       target application that indicates that the RMA operations are done, the
923       application must ensure that the RMA data is visible to the host CPU or
924       GPU prior to executing code that accesses the data.  The target comple‐
925       tion of having received the sent message is  not  sufficient,  even  if
926       send-after-write ordering is supported.
927
928       Most  target  heterogenous memory completion semantics map to FI_TRANS‐
929       MIT_COMPLETE or FI_DELIVERY_COMPLETE.  Persistent memory (FI_PMEM capa‐
930       bility),  however,  is  often  used  with FI_COMMIT_COMPLETE semantics.
931       Heterogenous completion concepts still apply.
932
933       For transfers flagged by the initiator with FI_COMMIT_COMPLETE, a  com‐
934       pletion  at  the  target  indicates  that  the  results are visible and
935       durable.  For transfers targeting persistent memory, but using  a  dif‐
936       ferent completion semantic at the initiator, the visibility at the tar‐
937       get is similar to that described above.  Durability is only  associated
938       with transfers marked with FI_COMMIT_COMPLETE.
939
940       For transfers targeting persistent memory that request FI_DELIVERY_COM‐
941       PLETE, a completion, at either the initiator or  target,  indicates
942       that the data is visible.  Visibility at the target can be conveyed us‐
943       ing one of the above described mechanisms – generating a target comple‐
944       tion,  sending  a  message  from the initiator, etc.  Similarly, if the
945       initiator requested FI_TRANSMIT_COMPLETE,  then  additional  steps  are
946       needed  to  ensure visibility at the target.  For example, the transfer
947       can generate a completion at the target, which would indicate visibili‐
948       ty,  but  not  durability.   The initiator can also follow the transfer
949       with another operation that forces visibility, such as  using  FI_FENCE
950       in conjunction with FI_DELIVERY_COMPLETE.
951

NOTES

953       A  completion  queue must be bound to at least one enabled endpoint be‐
954       fore any operation such  as  fi_cq_read,  fi_cq_readfrom,  fi_cq_sread,
955       fi_cq_sreadfrom, etc. can be called on it.
956
957       Completion flags may be suppressed if the FI_NOTIFY_FLAGS_ONLY mode bit
958       has been set.  When enabled, only the following flags are guaranteed to
959       be  set  in  completion  data  when  they are valid: FI_REMOTE_READ and
960       FI_REMOTE_WRITE (when FI_RMA_EVENT capability bit has been set), FI_RE‐
961       MOTE_CQ_DATA, and FI_MULTI_RECV.
962
963       If  a  completion  queue  has  been  overrun, it will be placed into an
964       `overrun' state.  Read operations will continue to  return  any  valid,
965       non-corrupted  completions,  if available.  After all valid completions
966       have been retrieved, any attempt to read the CQ will result in  it  re‐
967       turning an FI_EOVERRUN error event.  Overrun completion queues are con‐
968       sidered fatal and may not be used to report additional completions once
969       the overrun occurs.
970

RETURN VALUES

972   fi_cq_open / fi_cq_signal
973       Returns 0 on success.  On error, returns a negative fabric errno.
974
975   fi_cq_read / fi_cq_readfrom
976       On success, returns the number of completions retrieved from the com‐
977       pletion queue.  On error, returns a negative fabric errno,  with  these
978       two  errors  explicitly  identified: If no completions are available to
979       read from the CQ, returns -FI_EAGAIN.  If the topmost completion is for
980       a failed transfer (an error entry), returns -FI_EAVAIL.
981
982   fi_cq_sread / fi_cq_sreadfrom
983       On success, returns the number of completions retrieved from the com‐
984       pletion queue.  On error, returns a negative fabric errno,  with  these
985       two errors explicitly identified: If the timeout expires or the calling
986       thread is signaled and no data is available to be read from the comple‐
987       tion  queue,  returns  -FI_EAGAIN.   If the topmost completion is for a
988       failed transfer (an error entry), returns -FI_EAVAIL.
989
990   fi_cq_readerr
991       On success, returns the positive value 1 (number of error entries re‐
992       turned).   On  error,  returns a negative fabric errno, with this error
993       explicitly identified: If no error completions are  available  to  read
994       from the CQ, returns -FI_EAGAIN.
995
996   fi_cq_strerror
997       Returns a character string interpretation of  the  provider  specific
998       error returned with a completion.
999
1000       Fabric errno values are defined in rdma/fi_errno.h.
1001

SEE ALSO

1003       fi_getinfo(3),  fi_endpoint(3),  fi_domain(3),  fi_eq(3),   fi_cntr(3),
1004       fi_poll(3)
1005

AUTHORS

1007       OpenFabrics.
1008
1009
1010
1011Libfabric Programmer’s Manual     2023-03-07                          fi_cq(3)
Impressum