NUMA(3)                    Linux Programmer's Manual                   NUMA(3)

NAME

       numa - NUMA policy library

SYNOPSIS

       #include <numa.h>

       cc ... -lnuma

       int numa_available(void);

       int numa_max_node(void);
       int numa_preferred(void);
       long numa_node_size(int node, long *freep);
       long long numa_node_size64(int node, long long *freep);

       nodemask_t numa_all_nodes;
       nodemask_t numa_no_nodes;
       int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen);

       void nodemask_zero(nodemask_t *mask);
       void nodemask_set(nodemask_t *mask, int node);
       void nodemask_clr(nodemask_t *mask, int node);
       int nodemask_isset(const nodemask_t *mask, int node);
       int nodemask_equal(const nodemask_t *a, const nodemask_t *b);

       void numa_set_interleave_mask(nodemask_t *nodemask);
       nodemask_t numa_get_interleave_mask(void);
       void numa_bind(nodemask_t *nodemask);
       void numa_set_preferred(int node);
       void numa_set_localalloc(int flag);
       void numa_set_membind(nodemask_t *nodemask);
       nodemask_t numa_get_membind(void);

       void *numa_alloc_interleaved_subset(size_t size, nodemask_t *nodemask);
       void *numa_alloc_interleaved(size_t size);
       void *numa_alloc_onnode(size_t size, int node);
       void *numa_alloc_local(size_t size);
       void *numa_alloc(size_t size);
       void numa_free(void *start, size_t size);

       int numa_run_on_node_mask(nodemask_t *nodemask);
       int numa_run_on_node(int node);
       nodemask_t numa_get_run_node_mask(void);

       void numa_interleave_memory(void *start, size_t size,
              nodemask_t *nodemask);
       void numa_tonode_memory(void *start, size_t size, int node);
       void numa_tonodemask_memory(void *start, size_t size,
              nodemask_t *nodemask);
       void numa_setlocal_memory(void *start, size_t size);
       void numa_police_memory(void *start, size_t size);
       int numa_distance(int node1, int node2);
       void numa_set_bind_policy(int strict);
       void numa_set_strict(int strict);
       void numa_error(char *where);
       void numa_warn(int number, char *where, ...);
       extern int numa_exit_on_error;

DESCRIPTION

       The libnuma library offers a simple programming interface to the NUMA
       (Non Uniform Memory Access) policy supported by the Linux kernel. On
       a NUMA architecture some memory areas have different latency or
       bandwidth than others.

       Available policies are page interleaving (i.e., allocate in a
       round-robin fashion from all, or a subset, of the nodes on the
       system), preferred node allocation (i.e., preferably allocate on a
       particular node), local allocation (i.e., allocate on the node on
       which the thread is currently executing), or allocation only on
       specific nodes (i.e., allocate on some subset of the available
       nodes).  It is also possible to bind threads to specific nodes.

       NUMA memory allocation policy is a per-thread attribute, but it is
       inherited by children.

       To set a specific policy globally for all memory allocations in a
       process and its children, it is easiest to start the process with the
       numactl(8) utility. For more fine-grained policy control inside an
       application this library can be used.

       NUMA memory allocation policy only takes effect when a page is
       actually faulted into the address space of a process by accessing it.
       The numa_alloc_* functions take care of this automatically.

       A node is defined as an area where all memory has the same speed as
       seen from a particular CPU. A node can contain multiple CPUs. Caches
       are ignored for this definition.

       This library is only concerned with nodes and their memory and does
       not deal with individual CPUs inside these nodes (except for
       numa_node_to_cpus()).

       Before any other calls in this library can be used, numa_available()
       must be called. If it returns -1, the behavior of all other functions
       in this library is undefined.

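       For example, a program can guard all other libnuma usage with this
       check (a minimal sketch):

           #include <numa.h>
           #include <stdio.h>
           #include <stdlib.h>

           int main(void)
           {
               if (numa_available() == -1) {
                   fprintf(stderr, "NUMA is not available on this system\n");
                   exit(EXIT_FAILURE);
               }
               /* other libnuma calls may be used from here on */
               return 0;
           }
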
       numa_max_node() returns the highest node number available on the
       current system.  If a node number or a node mask with a bit set above
       the value returned by this function is passed to a libnuma function,
       the result is undefined.

       numa_node_size() returns the memory size of a node.  If the argument
       freep is not NULL, it is used to return the amount of free memory on
       the node.  On error it returns -1.  numa_node_size64() works the same
       as numa_node_size() except that it returns values as long long
       instead of long.  This is useful on 32-bit architectures with large
       nodes.

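       As an illustration, the following fragment (assuming <numa.h> and
       <stdio.h> are included and numa_available() has already succeeded)
       prints the total and free memory of every node:

           long long free;
           int node;

           for (node = 0; node <= numa_max_node(); node++) {
               long long size = numa_node_size64(node, &free);
               if (size == -1)
                   continue;       /* size could not be determined */
               printf("node %d: %lld bytes total, %lld bytes free\n",
                      node, size, free);
           }
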
       Some of these functions accept or return a nodemask.  A nodemask has
       type nodemask_t.  It is an abstract bitmap type containing a bit set
       of nodes.  The maximum node number depends on the architecture, but
       is not larger than numa_max_node().  What happens in libnuma calls
       when bits above numa_max_node() are passed is undefined.  A
       nodemask_t should only be manipulated with the nodemask_zero(),
       nodemask_clr(), nodemask_isset(), and nodemask_set() functions.
       nodemask_zero() clears a nodemask_t.  nodemask_isset() returns true
       if node is set in the passed nodemask.  nodemask_clr() clears node in
       nodemask.  nodemask_set() sets node in nodemask.  The predefined
       variable numa_all_nodes has all available nodes set; numa_no_nodes is
       the empty set.  nodemask_equal() returns non-zero if its two nodemask
       arguments are equal.

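       For example, a mask containing nodes 0 and 1 can be built and queried
       like this (sketch):

           nodemask_t mask;

           nodemask_zero(&mask);             /* start with the empty set */
           nodemask_set(&mask, 0);           /* add node 0 */
           nodemask_set(&mask, 1);           /* add node 1 */
           if (nodemask_isset(&mask, 1))     /* true: node 1 is set */
               nodemask_clr(&mask, 1);       /* remove node 1 again */
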
       numa_preferred() returns the preferred node of the current thread.
       This is the node on which the kernel preferably allocates memory,
       unless some other policy overrides this.

       numa_set_interleave_mask() sets the memory interleave mask for the
       current thread to nodemask.  All new memory allocations are page
       interleaved over all nodes in the interleave mask.  Interleaving can
       be turned off again by passing an empty mask (numa_no_nodes).  The
       page interleaving only occurs on the actual page fault that puts a
       new page into the current address space. It is also only a hint: the
       kernel will fall back to other nodes if no memory is available on the
       interleave target.  This is a low-level function; it may be more
       convenient to use the higher-level functions
       numa_alloc_interleaved() or numa_alloc_interleaved_subset().

       numa_get_interleave_mask() returns the current interleave mask.

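       A sketch of turning interleaving over all nodes on and off again:

           nodemask_t mask = numa_all_nodes;        /* every available node */

           numa_set_interleave_mask(&mask);
           /* memory allocated now is page interleaved across all nodes */
           numa_set_interleave_mask(&numa_no_nodes);  /* interleaving off */
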
       numa_bind() binds the current thread and its children to the nodes
       specified in nodemask.  They will only run on the CPUs of the
       specified nodes and only be able to allocate memory from them.  This
       function is equivalent to calling numa_run_on_node_mask(nodemask)
       followed by numa_set_membind(nodemask).  If threads should be bound
       to individual CPUs inside nodes, consider using numa_node_to_cpus()
       and the sched_setaffinity(2) system call.

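       For example, restricting the current thread and its future children
       to node 0 might look like this (sketch):

           nodemask_t mask;

           nodemask_zero(&mask);
           nodemask_set(&mask, 0);
           numa_bind(&mask);    /* run on node 0 and allocate only from it */
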
       numa_set_preferred() sets the preferred node for the current thread
       to node.  The preferred node is the node on which memory is
       preferably allocated before falling back to other nodes.  The default
       is to use the node on which the process is currently running (local
       policy).  Passing a -1 argument is equivalent to
       numa_set_localalloc().

       numa_set_localalloc() sets a local memory allocation policy for the
       calling thread.  Memory is preferably allocated on the node on which
       the thread is currently running.

       numa_set_membind() sets the memory allocation mask.  The thread will
       only allocate memory from the nodes set in nodemask.  Passing an
       argument of numa_no_nodes or numa_all_nodes turns off memory binding
       to specific nodes.

       numa_get_membind() returns the mask of nodes from which memory can
       currently be allocated.  If the returned mask is equal to
       numa_no_nodes or numa_all_nodes, then all nodes are available for
       memory allocation.

       numa_alloc_interleaved() allocates size bytes of memory page
       interleaved on all nodes.  This function is relatively slow and
       should only be used for large areas consisting of multiple pages.
       The interleaving works at page level and will only show an effect
       when the area is large.  The allocated memory must be freed with
       numa_free().  On error, NULL is returned.

       numa_alloc_interleaved_subset() is like numa_alloc_interleaved()
       except that it also accepts a mask of the nodes to interleave on.  On
       error, NULL is returned.

       numa_alloc_onnode() allocates memory on a specific node.  This
       function is relatively slow and allocations are rounded up to the
       system page size.  The memory must be freed with numa_free().  On
       error, NULL is returned.

       numa_alloc_local() allocates size bytes of memory on the local node.
       This function is relatively slow and allocations are rounded up to
       the system page size.  The memory must be freed with numa_free().
       On error, NULL is returned.

       numa_alloc() allocates size bytes of memory with the current NUMA
       policy.  This function is relatively slow and allocations are rounded
       up to the system page size.  The memory must be freed with
       numa_free().  On error, NULL is returned.

       numa_free() frees size bytes of memory starting at start, allocated
       by the numa_alloc_* functions above.

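       For example, a fragment that allocates a few pages on node 0 and
       releases them again (sketch; assumes <unistd.h>, <stdio.h> and
       <stdlib.h> are included and error handling is kept minimal):

           size_t len = 16 * getpagesize();
           void *mem = numa_alloc_onnode(len, 0);   /* allocate on node 0 */

           if (mem == NULL) {
               perror("numa_alloc_onnode");
               exit(EXIT_FAILURE);
           }
           /* ... use the memory ... */
           numa_free(mem, len);
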
       numa_run_on_node() runs the current thread and its children on a
       specific node.  They will not migrate to CPUs of other nodes until
       the node affinity is reset with a new call to
       numa_run_on_node_mask().  Passing -1 permits the kernel to schedule
       on all nodes again.  On success, 0 is returned; on error -1 is
       returned, and errno is set to indicate the error.

       numa_run_on_node_mask() runs the current thread and its children only
       on nodes specified in nodemask.  They will not migrate to CPUs of
       other nodes until the node affinity is reset with a new call to
       numa_run_on_node_mask().  Passing numa_all_nodes permits the kernel
       to schedule on all nodes again.  On success, 0 is returned; on error
       -1 is returned, and errno is set to indicate the error.

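       A sketch of temporarily restricting execution to node 0 (assuming
       <stdio.h> is included):

           if (numa_run_on_node(0) == -1)
               perror("numa_run_on_node");
           /* ... work that should run on the CPUs of node 0 ... */
           numa_run_on_node(-1);    /* allow scheduling on all nodes again */
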
       numa_get_run_node_mask() returns the mask of nodes that the current
       thread is allowed to run on.

       numa_interleave_memory() interleaves size bytes of memory page by
       page from start on nodes nodemask.  This is a lower level function to
       interleave memory that has been allocated but not yet faulted in.
       Not yet faulted in means the memory is allocated using mmap(2) or
       shmat(2), but has not been accessed by the current process yet.  The
       memory is page interleaved to all nodes specified in nodemask.
       Normally numa_alloc_interleaved() should be used for private memory
       instead, but this function is useful to handle shared memory areas.
       To be useful the memory area should be at least several megabytes
       (or tens of megabytes for hugetlbfs mappings).  If the
       numa_set_strict() flag is true, then the operation will cause a call
       to numa_error() if there were already pages in the mapping that do
       not follow the policy.

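       For example, a freshly created shared mapping can be interleaved
       before it is touched (sketch; assumes <sys/mman.h>, <stdio.h> and
       <stdlib.h> are included):

           size_t len = 64UL << 20;                    /* 64 MB area */
           void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                          MAP_SHARED | MAP_ANONYMOUS, -1, 0);

           if (p == MAP_FAILED) {
               perror("mmap");
               exit(EXIT_FAILURE);
           }
           numa_interleave_memory(p, len, &numa_all_nodes);
           /* pages spread over the nodes as they are first touched */
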
       numa_tonode_memory() puts memory on a specific node.  The constraints
       described for numa_interleave_memory() apply here too.

       numa_tonodemask_memory() puts memory on a specific set of nodes.  The
       constraints described for numa_interleave_memory() apply here too.

       numa_setlocal_memory() places memory on the current node.  The
       constraints described for numa_interleave_memory() apply here too.

       numa_police_memory() places memory with the current NUMA policy.  The
       constraints described for numa_interleave_memory() apply here too.

       numa_node_to_cpus() converts a node number to a bitmask of CPUs.  The
       user must pass a long enough buffer.  If the buffer is not long
       enough, errno will be set to ERANGE and -1 returned.  On success 0 is
       returned.

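       A sketch of querying the CPUs of node 0 with a fixed-size buffer (the
       buffer size chosen here is only an example; <stdio.h> is assumed to
       be included):

           unsigned long cpus[8];  /* up to 512 CPU bits on 64-bit systems */

           if (numa_node_to_cpus(0, cpus, sizeof(cpus)) == -1)
               perror("numa_node_to_cpus");  /* ERANGE: buffer too small */
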
       numa_set_bind_policy() specifies whether calls that bind memory to a
       specific node should use the preferred policy or a strict policy.
       The preferred policy allows the kernel to allocate memory on other
       nodes when there isn't enough free memory on the target node; strict
       will fail the allocation in that case.  Setting the argument to 1
       specifies strict, 0 preferred.  Note that specifying more than one
       node with a non-strict policy may only use the first node in some
       kernel versions.

       numa_set_strict() sets a flag that says whether the functions
       allocating on specific nodes should use a strict policy.  Strict
       means the allocation will fail if the memory cannot be allocated on
       the target node.  The default operation is to fall back to other
       nodes.  This doesn't apply to the interleave and default policies.

       numa_distance() reports the distance in the machine topology between
       two nodes.  The factors are multiples of 10.  It returns 0 when the
       distance cannot be determined.  A node has distance 10 to itself.
       Reporting the distance requires a Linux kernel version of 2.6.10 or
       newer.

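       For example, the full distance matrix can be printed like this
       (sketch; <stdio.h> assumed to be included):

           int i, j, max = numa_max_node();

           for (i = 0; i <= max; i++)
               for (j = 0; j <= max; j++)
                   printf("distance %d -> %d: %d\n", i, j,
                          numa_distance(i, j));
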
       numa_error() is a weak internal libnuma function that can be
       overridden by the user program.  This function is called with a
       char * argument when a libnuma function fails.  Overriding the weak
       library definition makes it possible to specify a different error
       handling strategy when a libnuma function fails.  It does not affect
       numa_available().

       The numa_error() function defined in libnuma prints an error on
       stderr and terminates the program if numa_exit_on_error is set to a
       non-zero value.  The default value of numa_exit_on_error is zero.

       numa_warn() is a weak internal libnuma function that can also be
       overridden by the user program.  It is called to warn the user when a
       libnuma function encounters a non-fatal error.  The default
       implementation prints a warning to stderr.

       The first argument is a unique number identifying each warning.
       After that there is a printf(3)-style format string and a variable
       number of arguments.

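       For example, a program that prefers to abort on any libnuma failure
       can provide its own definition (sketch):

           #include <numa.h>
           #include <stdio.h>
           #include <stdlib.h>

           /* Overrides the weak numa_error() definition in libnuma. */
           void numa_error(char *where)
           {
               fprintf(stderr, "libnuma error in %s\n", where);
               abort();
           }
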

THREAD SAFETY

       numa_set_bind_policy() and numa_exit_on_error are process global.
       The other calls are thread safe.

       Memory policy set for memory areas is shared by all threads of the
       process.  Memory policy is also shared by other processes mapping the
       same memory using shmat(2) or mmap(2) from shmfs/hugetlbfs.  It is
       not shared for disk-backed file mappings right now, although that may
       change in the future.

COPYRIGHT

       Copyright 2002, 2004, Andi Kleen, SuSE Labs.  libnuma is under the
       GNU Lesser General Public License, v2.1.


SEE ALSO

       get_mempolicy(2), getpagesize(2), mbind(2), mmap(2),
       sched_setaffinity(2), set_mempolicy(2), shmat(2), numactl(8)

SuSE Labs                          May 2004                            NUMA(3)