1TM(3)                      Library Functions Manual                      TM(3)
2
3
4

NAME

6       tm_init,  tm_nodeinfo,  tm_poll, tm_notify, tm_spawn, tm_kill, tm_obit,
7       tm_taskinfo,   tm_atnode,   tm_rescinfo,   tm_publish,    tm_subscribe,
8       tm_finalize - task management API
9

SYNOPSIS

11       #include <tm.h>
12
13       int tm_init(info, roots)
14      void ∗info;
15      struct tm_roots ∗roots;
16
17       int tm_nodeinfo(list, nnodes)
18      tm_node_id ∗∗list;
19      int ∗nnodes;
20
21       int tm_poll(poll_event, result_event, wait, tm_errno)
22      tm_event_t poll_event;
23      tm_event_t ∗result_event;
24      int wait;
25      int ∗tm_errno;
26
27       int tm_notify(tm_signal)
28      int tm_signal;
29
30       int tm_spawn(argc, argv, envp, where, tid, event)
31      int argc;
32      char ∗∗argv;
33      char ∗∗envp;
34      tm_node_id where;
35      tm_task_id ∗tid;
36      tm_event_t ∗event;
37
38       int tm_kill(tid, sig, event)
39      tm_task_id tid;
40      int sig;
41      tm_event_t event;
42
43       int tm_obit(tid, obitval, event)
44      tm_task_id tid;
45      int ∗obitval;
46      tm_event_t event;
47
48       int tm_taskinfo(node, tid_list, list_size, ntasks, event)
49      tm_node_id node;
50      tm_task_id ∗tid_list;
51      int list_size;
52      int ∗ntasks;
53      tm_event_t event;
54
55       int tm_atnode(tid, node)
56      tm_task_id tid;
57      tm_node_id ∗node;
58
59       int tm_rescinfo(node, resource, len, event)
60      tm_node_id node;
61      char ∗resource;
62      int len;
63      tm_event_t event;
64
65       int tm_publish(name, info, len, event)
66      char ∗name;
67      void ∗info;
68      int len;
69      tm_event_t event;
70
71       int tm_subscribe(tid, name, info, len, info_len, event)
72      tm_task_id tid;
73      char ∗name;
74      void ∗info;
75      int len;
76      int ∗info_len;
77      tm_event_t event;
78
79       int tm_finalize()
80

DESCRIPTION

82       These functions provide a partial implementation of the task management
83       interface part of the PSCHED API.  In PBS, MOM provides the  task  man‐
84       ager  functions.  This library opens a tcp socket to the MOM running on
85       the local host and sends and receives messages.
86
87       The PSCHED Task Management API description used to create this  library
88       was  committed  to paper on November 15, 1996 and was given the version
89       number 0.1.  Changes may have taken place since that time which are not
90       reflected in this library.
91
92       The  API  description uses several data types that it purposefully does
93       not define.  This was done so an implementation would not be confined in
94       the  way  it was written.  For this specific work, the definitions fol‐
95       low:
96
97       typedef   int            tm_node_id;    /* job-relative node id */
98       #define   TM_ERROR_NODE  ((tm_node_id)-1)
99
100       typedef   int            tm_event_t;    /* event handle, > 0 for real events */
101       #define   TM_NULL_EVENT  ((tm_event_t)0)
102       #define   TM_ERROR_EVENT ((tm_event_t)-1)
103
104       typedef   unsigned long  tm_task_id;
105       #define   TM_NULL_TASK   (tm_task_id)0
106
107       There are a number of error values defined as well: TM_SUCCESS,
108       TM_ESYSTEM, TM_ENOEVENT, TM_ENOTCONNECTED, TM_EUNKNOWNCMD, TM_ENOTIM‐
109       PLEMENTED, TM_EBADENVIRONMENT, TM_ENOTFOUND.
110
111       tm_init() initializes the library by opening a socket to the MOM on the
112       local  host  and sending a TM_INIT message, then waiting for the reply.
113       The info parameter has no use and is  included  to  conform  with  the
114       PSCHED  document.   The roots pointer will contain valid data after the
115       function returns and has the following structure:
116
117       struct    tm_roots {
118            tm_task_id     tm_me;
119            tm_task_id     tm_parent;
120            int       tm_nnodes;
121            int       tm_ntasks;
122            int       tm_taskpoolid;
123            tm_task_id     *tm_tasklist;
124       };
125
126
127       tm_me               The task id of this calling task.
128
129       tm_parent           The task id of the task which spawned this task  or
130                           TM_NULL_TASK  if  the  calling  task is the initial
131                           task started by PBS.
132
133       tm_nnodes           The number of nodes allocated to the job.
134
135       tm_ntasks           This will always be 0 for PBS.
136
137       tm_taskpoolid       PBS does not support task pools so this will always
138                           be -1.
139
140       tm_tasklist         This will be NULL for PBS.
141
142       The tm_ntasks, tm_taskpoolid and tm_tasklist fields are not filled with
143       data specified by the PSCHED document.  PBS does not support task pools
144       and,  at  this  time, does not return information about current running
145       tasks from tm_init.  There is a separate call to  get  information  for
146       current running tasks called tm_taskinfo which is described below.  The
147       return value from tm_init will be TM_SUCCESS if the library initialization
148       was successful, or an error return otherwise.
149
150       tm_nodeinfo()  places a pointer to a malloc'ed array of tm_node_id's in
151       the pointer pointed at by list.  The order of the tm_node_id's in  list
152       is the same as that specified to MOM in the "exec_host" attribute.  The
153       int pointed to by nnodes contains the number of nodes allocated to  the
154       job.   This  is  information that is returned during initialization and
155       does not require communication with  MOM.   If  tm_init  has  not  been
156       called, TM_ESYSTEM is returned, otherwise TM_SUCCESS is returned.
157
158       tm_poll()  is  the  function  which will retrieve information about the
159       task management system  to  locations  specified  when  other  routines
160       request an action take place.  The bookkeeping for this is done by gen‐
161       erating an event for each action.  When the task manager (MOM) sends  a
162       message  that  an  action is complete, the event is reported by tm_poll
163       and information is placed where the caller requested it.  The  argument
164       poll_event  is  meant  to  be  used  to request a specific event.  This
165       implementation does not use it and it must be set to  TM_NULL_EVENT  or
166       an error is returned.  Upon return, the argument result_event will con‐
167       tain a valid event number or TM_ERROR_EVENT on error.  If wait is  zero
168       and   there   are   no   events  to  report,  result_event  is  set  to
169       TM_NULL_EVENT.  If wait is non-zero and there are no events to  report,
170       the  function will block waiting for an event.  If no local error takes
171       place, TM_SUCCESS is returned.  If an error is reported by MOM  for  an
172       event, then the argument tm_errno will be set to an error code.
173
174       tm_notify() is described in the PSCHED documentation, but is not imple‐
175       mented for PBS yet.  It will return TM_ENOTIMPLEMENTED.
176
177       tm_spawn() sends a message to MOM to start a new task.  The node id  of
178       the  host to run the task is given by where.  The parameters argc, argv
179       and envp specify the program to run and its arguments  and  environment
180       very much like exec().  The full path of the program executable must be
181       given by argv[0] and the number of elements in the argv array is  given
182       by argc.  The array envp is NULL terminated.  The argument event points
183       to a tm_event_t variable which is filled in with an event number.  When
184       this  event  is  returned by tm_poll, the tm_task_id pointed to by tid
185       will contain the task id of the newly created task.  In  addition,  the
186       tid  is  available  to the process in the PBS_TASKNUM environment vari‐
187       able.  Similarly, the node number is in the  PBS_NODENUM  variable  and
188       the cpu number is in the PBS_VNODENUM variable.
189
190       tm_kill()  sends  a signal specified by sig to the task tid and puts an
191       event number in the tm_event_t pointed to by event.
192
193       tm_obit() creates an event which will be reported  when  the  task  tid
194       exits.   The  int  pointed to by obitval will contain the exit value of
195       the task when the event is reported.
196
197       tm_taskinfo() returns the list of tasks running on the  node  specified
198       by  node.   The  PSCHED  documentation  mentions  a  special ability to
199       retrieve all tasks running in the job.  This is not supported  by  PBS.
200       The argument tid_list points to an array of tm_task_id's which contains
201       list_size elements.  Upon return, event will contain an  event  number.
202       When  this  event  is polled, the int pointed to by ntasks will contain
203       the number of tasks running on the node and the array will be filled in
204       with tm_task_id's.  If ntasks is greater than list_size, only list_size
205       tasks will be returned.
206
207       tm_atnode() will place the node id where the task  tid  exists  in  the
208       tm_node_id pointed to by node.
209
210       tm_rescinfo()  makes  a  request  for a string specifying the resources
211       available on a node given by the argument node.  The string is returned
212       in the buffer pointed to by resource and is terminated by a NUL charac‐
213       ter unless the number of characters  of  information  is  greater  than
214       specified  by len.  The resource string PBS returns is formatted as fol‐
215       lows:
216
217       A space separated set of strings from the uname system call followed by
218       a  colon  (:).  The order of the strings is sysname, nodename, release,
219       version, machine.
220
221       A  comma  separated  set  of  strings  giving  the  components  of  the
222       "Resource_List"  attribute of the job.  Each component has the resource
223       name, an equal sign, and the limit value.
224
225       For example, a return for a task running on an  SGI  workstation  might
226       look like:
227
228       IRIX golum 6.2 03131015 IP22:cput=20:00,mem=400kb
229
230       tm_publish()  causes  len bytes of information pointed at by info to be
231       sent to the local MOM to be saved under the name given by name.
232
233       tm_subscribe() returns a copy of the information named by name for  the
234       task  given  by  tid.  The argument info points to a buffer of size len
235       where the information will be returned.  The argument info_len will  be
236       set  with  the  size of the published data.  If this is larger than the
237       supplied buffer, the data will have been truncated.
238
239       tm_finalize() may be called to free any memory in use  by  the  library
240       and close the connection to MOM.
241

SEE ALSO

243       pbs_mom,   PSCHED:   An   API   for  Parallel  Job/Resource  Management,
244       http://parallel.nas.nasa.gov/Psched/psched-api-report.ps
245
246
247
248                                  21 May 1997                            TM(3)
Impressum