1Slurm API(3) Slurm checkpoint functions Slurm API(3)
2
3
4
6 slurm_checkpoint_able, slurm_checkpoint_complete, slurm_checkpoint_cre‐
7 ate, slurm_checkpoint_disable, slurm_checkpoint_enable, slurm_check‐
8 point_error, slurm_checkpoint_restart, slurm_checkpoint_tasks,
9 slurm_checkpoint_vacate - Slurm checkpoint functions
10
11
13 #include <slurm/slurm.h>
14
15 int slurm_checkpoint_able (
16 uint32_t job_id,
17 uint32_t step_id,
18 time_t *start_time
19 );
20
21 int slurm_checkpoint_complete (
22 uint32_t job_id,
23 uint32_t step_id,
24 time_t start_time,
25 uint32_t error_code,
26 char *error_msg
27 );
28
29 int slurm_checkpoint_create (
30 uint32_t job_id,
31 uint32_t step_id,
32 uint16_t max_wait,
33 char *image_dir
34 );
35
36 int slurm_checkpoint_disable (
37 uint32_t job_id,
38 uint32_t step_id
39 );
40
41 int slurm_checkpoint_enable (
42 uint32_t job_id,
43 uint32_t step_id
44 );
45
46 int slurm_checkpoint_error (
47
48 uint32_t job_id,
49 uint32_t step_id,
50 uint32_t *error_code,
51 char **error_msg
52 );
53
54 int slurm_checkpoint_restart (
55 uint32_t job_id,
56 uint32_t step_id,
57 uint16_t stick,
58 char *image_dir
59 );
60
61 int slurm_checkpoint_tasks (
62 uint32_t job_id,
63 uint32_t step_id,
64 time_t begin_time,
65 char *image_dir,
66 uint16_t max_wait,
67 char *nodelist
68 );
69
70 int slurm_checkpoint_vacate (
71 uint32_t job_id,
72 uint32_t step_id,
73 uint16_t max_wait,
74 char *image_dir
75 );
76
77
79 begin_time
80 When to begin the operation.
81
82 error_code
83 Error code for checkpoint operation. Only the highest value is
84 preserved.
85
86 error_msg
87 Error message for checkpoint operation. Only the error_msg value
88 for the highest error_code is preserved.
89
90 image_dir
91 Directory specification for where the checkpoint file should be
92 read from or written to. The default value is specified by the
93 JobCheckpointDir Slurm configuration parameter.
94
95 job_id Slurm job ID to perform the operation upon.
96
97 max_wait
98 Maximum time to allow for the operation to complete in seconds.
99
100 nodelist
101 Nodes to which the request should be sent.
102
103 start_time
104 Time at which the last checkpoint operation began (if one is in
105 progress), otherwise zero.
106
107 step_id
108 Slurm job step ID to perform the operation upon. May be NO_VAL
109 if the operation is to be performed on all steps of the speci‐
110 fied job. Specify SLURM_BATCH_SCRIPT to checkpoint a batch job.
111
112 stick If non-zero then restart the job on the same nodes that it was
113 checkpointed from.
114
115
117 slurm_checkpoint_able Report whether checkpoint operations can presently
118 be issued for the specified job step. If so, returns SLURM_SUCCESS and
119 sets start_time if a checkpoint operation is presently active. Returns
120 ESLURM_DISABLED if checkpoint operations are disabled.
121
122 slurm_checkpoint_complete Note that a requested checkpoint has been
123 completed.
124
125 slurm_checkpoint_create Request a checkpoint for the identified job
126 step. Continue its execution upon completion of the checkpoint.
127
128 slurm_checkpoint_disable Make the identified job step non-check‐
129 pointable. This can be issued as needed to prevent checkpointing while
130 a job step is in a critical section or for other reasons.
131
132 slurm_checkpoint_enable Make the identified job step checkpointable.
133
134 slurm_checkpoint_error Get error information about the last checkpoint
135 operation for a given job step.
136
137 slurm_checkpoint_restart Request that a previously checkpointed job
138 resume execution. It may continue execution on different nodes than
139 were originally used. Execution may be delayed if resources are not
140 immediately available.
141
142 slurm_checkpoint_vacate Request a checkpoint for the identified job
143 step. Terminate its execution upon completion of the checkpoint.
144
145
146
148 Zero is returned upon success. On error, -1 is returned, and the Slurm
149 error code is set appropriately.
150
152 ESLURM_INVALID_JOB_ID the requested job or job step ID does not exist.
153
154 ESLURM_ACCESS_DENIED the requesting user lacks authorization for the
155 requested action (e.g. trying to delete or modify another user's job).
156
157 ESLURM_JOB_PENDING the requested job is still pending.
158
159 ESLURM_ALREADY_DONE the requested job has already completed.
160
161 ESLURM_DISABLED the requested operation has been disabled for this job
162 step. This will occur when a checkpoint request is issued after
163 checkpoints have been disabled.
164
165 ESLURM_NOT_SUPPORTED the requested operation is not supported on this
166 system.
167
168
170 #include <stdio.h>
171 #include <stdlib.h>
172 #include <slurm/slurm.h>
173 #include <slurm/slurm_errno.h>
174
175 int main (int argc, char *argv[])
176 {
177 uint32_t job_id, step_id;
178
179 if (argc < 3) {
180 printf("Usage: %s job_id step_id\n", argv[0]);
181 exit(1);
182 }
183
184 job_id = atoi(argv[1]);
185 step_id = atoi(argv[2]);
186 if (slurm_checkpoint_disable(job_id, step_id)) {
187 slurm_perror ("slurm_checkpoint_disable:");
188 exit (1);
189 }
190 exit (0);
191 }
192
193
195 These functions are included in the libslurm library, which must be
196 linked to your process for use (e.g. "cc myprog.c -lslurm").
197
198
200 Copyright (C) 2004-2007 The Regents of the University of California.
201 Copyright (C) 2008-2009 Lawrence Livermore National Security. Produced
202 at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
203 CODE-OCEC-09-009. All rights reserved.
204
205 This file is part of Slurm, a resource management program. For
206 details, see <https://slurm.schedmd.com/>.
207
208 Slurm is free software; you can redistribute it and/or modify it under
209 the terms of the GNU General Public License as published by the Free
210 Software Foundation; either version 2 of the License, or (at your
211 option) any later version.
212
213 Slurm is distributed in the hope that it will be useful, but WITHOUT
214 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
215 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
216 for more details.
217
218
220 srun(1), squeue(1), free(3), slurm.conf(5)
221
222
223
224April 2015 Slurm checkpoint functions Slurm API(3)