1Slurm API(3) Slurm job step launch functions Slurm API(3)
2
3
4
6 slurm_step_launch_params_t_init, slurm_step_launch,
7 slurm_step_launch_fwd_signal, slurm_step_launch_wait_start,
8 slurm_step_launch_wait_finish, slurm_step_launch_abort - Slurm job step
9 launch functions
10
11
13 #include <slurm/slurm.h>
14
15 void slurm_step_launch_params_t_init (
16 slurm_step_launch_params_t *launch_req
17 );
18
19 int slurm_step_launch (
20 slurm_step_ctx ctx,
21 const slurm_step_launch_params_t *launch_req,
22 const slurm_step_launch_callbacks_t *callbacks
23 );
24
25 void slurm_step_launch_fwd_signal (
26 slurm_step_ctx ctx,
27 int signo
28 );
29
30 int slurm_step_launch_wait_start (
31 slurm_step_ctx ctx
32 );
33
34 void slurm_step_launch_wait_finish (
35 slurm_step_ctx ctx
36 );
37
38 void slurm_step_launch_abort (
39 slurm_step_ctx ctx
40 );
41
42
44 callbacks
45 Identify functions to be called when various events occur.
46
47 ctx Job step context. Created by slurm_step_ctx_create, used in sub‐
48 sequent function calls, and destroyed by slurm_step_ctx_destroy.
49
50 launch_req
51 Pointer to a structure allocated by the user containing specifi‐
52 cations of the job step to be launched.
53
54
56 slurm_step_launch_params_t_init Initialize a user-allocated
57 slurm_step_launch_params_t structure with default values. This
58 function will NOT allocate any new memory.
59
60 slurm_step_launch Launch a parallel job step.
61
62 slurm_step_launch_fwd_signal Forward a signal to all those nodes with
63 running tasks.
64
65 slurm_step_launch_wait_start Block until all tasks have started.
66
67 slurm_step_launch_wait_finish Block until all tasks have finished (or
68 failed to start altogether).
69
70 slurm_step_launch_abort Abort an in-progress launch, or terminate the
71 fully launched job step. Can be called from a signal handler.
72
73
75 Use the local_fds entry in slurm_step_launch_params_t to specify file
76 descriptors to be used for standard input, output and error. Any
77 local_fds not specified will result in the launched tasks using the
78 calling process's standard input, output and error. Threads created by
79 slurm_step_launch will completely handle copying data between the
80 remote processes and the specified local file descriptors.
81
82 Use the substructure in slurm_step_io_fds_t to restrict the redirection
83 of I/O to a specific node or task ID. For example, to redirect standard
84 output only from task 0, set
85
86 params.local_fds.out.taskid = 0;
87
88 Use the remote_*_filename fields in slurm_step_launch_params_t to have
89 launched tasks read and/or write directly to local files rather than
90 transferring data over the network to the calling process. These
91 strings support many of the same format options as the srun command.
92 Any remote_*_filename fields set will supersede the corresponding
93 local_fds entries. For example, the following code will direct each
94 task to write standard output and standard error to local files with
95 names containing the task ID (e.g. "/home/bob/test_output/run1.out.0"
96 and "/home/bob/test_output/run1.err.0" for task 0).
97
98 params.remote_output_filename = "/home/bob/test_output/run1.out.%t";
99 params.remote_error_filename = "/home/bob/test_output/run1.err.%t";
100
101
103 slurm_step_launch and slurm_step_launch_wait_start will return
104 SLURM_SUCCESS when all tasks have successfully started, or SLURM_ERROR
105 if the job step is aborted during launch.
106
107
109 EINVAL Invalid argument
110
111 SLURM_PROTOCOL_VERSION_ERROR Protocol version has changed, re-link your
112 code.
113
114 ESLURM_INVALID_JOB_ID the requested job id does not exist.
115
116 ESLURM_ALREADY_DONE the specified job has already completed and can not
117 be modified.
118
119 ESLURM_ACCESS_DENIED the requesting user lacks authorization for the
120 requested action (e.g. trying to delete or modify another user's job).
121
122 ESLURM_INTERCONNECT_FAILURE failed to configure the node interconnect.
123
124 ESLURM_BAD_DIST task distribution specification is invalid.
125
126 SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT Timeout in communicating with Slurm
127 controller.
128
129
131 /*
132 * To compile:
133 * gcc test.c -o test -g -pthread -lslurm
134 *
135 * Or if Slurm is not in your default search paths:
136 * gcc test.c -o test -g -pthread -I${SLURM_DIR}/include \
137 * -Wl,--rpath=${SLURM_DIR}/lib -L${SLURM_DIR}/lib -lslurm
138 */
139 #include <stdio.h>
140 #include <stdlib.h>
141 #include <string.h>
142 #include <slurm/slurm.h>
143 #include <slurm/slurm_errno.h>
144
145 static void _task_start(launch_tasks_response_msg_t *msg)
146 {
147 printf("%d tasks started on node %s\n",
148 msg->count_of_pids, msg->node_name);
149 }
150
151 static void _task_finish(task_exit_msg_t *msg)
152 {
153 printf("%d tasks finished\n", msg->num_tasks);
154 }
155
156 int main (int argc, char *argv[])
157 {
158 slurm_step_ctx_params_t step_params;
159 slurm_step_ctx step_ctx;
160 slurm_step_launch_params_t params;
161 slurm_step_launch_callbacks_t callbacks;
162 uint32_t job_id, step_id;
163
164 slurm_step_ctx_params_t_init(&step_params);
165 step_params.node_count = 1;
166 step_params.task_count = 4;
167 step_params.overcommit = true;
168
169 step_ctx = slurm_step_ctx_create(&step_params);
170 if (step_ctx == NULL) {
171 slurm_perror("slurm_step_ctx_create");
172 exit(1);
173 }
174 slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_JOBID, &job_id);
175 slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_STEPID, &step_id);
176 printf("Ready to start job %u step %u\n", job_id, step_id);
177
178 slurm_step_launch_params_t_init(¶ms);
179 params.argc = argc - 1;
180 params.argv = argv + 1;
181 callbacks.task_start = _task_start;
182 callbacks.task_finish = _task_finish;
183 if (slurm_step_launch(step_ctx, NULL, ¶ms, &callbacks)
184 != SLURM_SUCCESS) {
185 slurm_perror("slurm_step_launch");
186 exit(1);
187 }
188 printf("Sent step launch RPC\n");
189
190 if (slurm_step_launch_wait_start(step_ctx) != SLURM_SUCCESS) {
191 fprintf(stderr, "job step was aborted during launch\n");
192 } else {
193 printf("All tasks have started\n");
194 }
195
196 slurm_step_launch_wait_finish(step_ctx);
197 printf("All tasks have finished\n");
198
199 slurm_step_ctx_destroy(step_ctx);
200 exit(0);
201 }
202
203
205 These functions are included in the libslurm library, which must be
206 linked to your process for use (e.g. "cc -lslurm myprog.c").
207
208
210 Copyright (C) 2006-2007 The Regents of the University of California.
211 Copyright (C) 2008 Lawrence Livermore National Security. Produced at
212 Lawrence Livermore National Laboratory (cf, DISCLAIMER).
213 CODE-OCEC-09-009. All rights reserved.
214
215 This file is part of Slurm a resource management program. For
216 details, see <https://slurm.schedmd.com/>.
217
218 Slurm is free software; you can redistribute it and/or modify it under
219 the terms of the GNU General Public License as published by the Free
220 Software Foundation; either version 2 of the License, or (at your
221 option) any later version.
222
223 Slurm is distributed in the hope that it will be useful, but WITHOUT
224 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
225 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
226 for more details.
227
229 slurm_step_ctx_create(3), slurm_step_ctx_destroy(3),
230 slurm_get_errno(3), slurm_perror(3), slurm_strerror(3), salloc(1),
231 srun(1)
232
233
234
235April 2015 Slurm job step launch functions Slurm API(3)