1Slurm API(3)              Slurm checkpoint functions              Slurm API(3)
2
3
4

NAME

6       slurm_checkpoint_able, slurm_checkpoint_complete, slurm_checkpoint_cre‐
7       ate,  slurm_checkpoint_disable,  slurm_checkpoint_enable,  slurm_check‐
8       point_error, slurm_checkpoint_restart, slurm_checkpoint_tasks,
9       slurm_checkpoint_vacate - Slurm checkpoint functions
10
11

SYNTAX

13       #include <slurm/slurm.h>
14
15       int slurm_checkpoint_able (
16            uint32_t job_id,
17            uint32_t step_id,
18            time_t *start_time
19       );
20
21       int slurm_checkpoint_complete (
22            uint32_t job_id,
23            uint32_t step_id,
24            time_t start_time,
25            uint32_t error_code,
26            char *error_msg
27       );
28
29       int slurm_checkpoint_create (
30            uint32_t job_id,
31            uint32_t step_id,
32            uint16_t max_wait,
33            char *image_dir
34       );
35
36       int slurm_checkpoint_disable (
37            uint32_t job_id,
38            uint32_t step_id
39       );
40
41       int slurm_checkpoint_enable (
42            uint32_t job_id,
43            uint32_t step_id
44       );
45
46       int slurm_checkpoint_error (
47
48            uint32_t job_id,
49            uint32_t step_id,
50            uint32_t *error_code,
51            char ** error_msg
52       );
53
54       int slurm_checkpoint_restart (
55            uint32_t job_id,
56            uint32_t step_id,
57            uint16_t stick,
58            char *image_dir
59       );
60
61       int slurm_checkpoint_tasks (
62            uint32_t job_id,
63            uint32_t step_id,
64            time_t begin_time,
65            char *image_dir,
66            uint16_t max_wait,
67            char *nodelist
68       );
69
70       int slurm_checkpoint_vacate (
71            uint32_t job_id,
72            uint32_t step_id,
73            uint16_t max_wait,
74            char *image_dir
75       );
76
77

ARGUMENTS

79       begin_time
80              When to begin the operation.
81
82       error_code
83              Error code for checkpoint operation. Only the highest  value  is
84              preserved.
85
86       error_msg
87              Error message for checkpoint operation. Only the error_msg value
88              for the highest error_code is preserved.
89
90       image_dir
91              Directory specification for where the checkpoint file should  be
92              read  from  or written to. The default value is specified by the
93              JobCheckpointDir Slurm configuration parameter.
94
95       job_id Slurm job ID to perform the operation upon.
96
97       max_wait
98              Maximum time to allow for the operation to complete in seconds.
99
100       nodelist
101              Nodes to which the request should be sent.
102
103       start_time
104              Time at which last checkpoint operation  began  (if  one  is  in
105              progress), otherwise zero.
106
107       step_id
108              Slurm  job step ID to perform the operation upon.  May be NO_VAL
109              if the operation is to be performed on all steps of  the  speci‐
110              fied job.  Specify SLURM_BATCH_SCRIPT to checkpoint a batch job.
111
112       stick  If  non-zero  then restart the job on the same nodes that it was
113              checkpointed from.
114
115

DESCRIPTION

117       slurm_checkpoint_able Report if checkpoint operations can presently  be
118       issued  for  the specified job step.  If yes, returns SLURM_SUCCESS and
119       sets start_time if checkpoint operation is  presently  active.  Returns
120       ESLURM_DISABLED if checkpoint operation is disabled.
121
122       slurm_checkpoint_complete  Note  that  a  requested checkpoint has been
123       completed.
124
125       slurm_checkpoint_create Request a checkpoint  for  the  identified  job
126       step.  Continue its execution upon completion of the checkpoint.
127
128       slurm_checkpoint_disable   Make  the  identified  job  step  non-check‐
129       pointable.  This can be issued as needed to prevent checkpointing while
130       a job step is in a critical section or for other reasons.
131
132       slurm_checkpoint_enable Make the identified job step checkpointable.
133
134       slurm_checkpoint_error  Get error information about the last checkpoint
135       operation for a given job step.
136
137       slurm_checkpoint_restart Request that  a  previously  checkpointed  job
138       resume  execution.   It  may continue execution on different nodes than
139       were originally used.  Execution may be delayed if  resources  are  not
140       immediately available.
141
142       slurm_checkpoint_vacate  Request  a  checkpoint  for the identified job
143       step.  Terminate its execution upon completion of the checkpoint.
144
145
146

RETURN VALUE

148       Zero is returned upon success.  On error, -1 is returned, and the Slurm
149       error code is set appropriately.
150

ERRORS

152       ESLURM_INVALID_JOB_ID the requested job or job step id does not exist.
153
154       ESLURM_ACCESS_DENIED  the  requesting  user lacks authorization for the
155       requested action (e.g. trying to delete or modify another user's job).
156
157       ESLURM_JOB_PENDING the requested job is still pending.
158
159       ESLURM_ALREADY_DONE the requested job has already completed.
160
161       ESLURM_DISABLED the requested operation has been disabled for this  job
162       step.  This will occur when a checkpoint request is issued for  a  job
163       step on which checkpoints have been disabled.
164
165       ESLURM_NOT_SUPPORTED the requested operation is not supported  on  this
166       system.
167
168

EXAMPLE

170       #include <stdio.h>
171       #include <stdlib.h>
172       #include <slurm/slurm.h>
173       #include <slurm/slurm_errno.h>
174
175       int main (int argc, char *argv[])
176       {
177            uint32_t job_id, step_id;
178
179            if (argc < 3) {
180                 printf("Usage: %s job_id step_id\n", argv[0]);
181                 exit(1);
182            }
183
184            job_id = atoi(argv[1]);
185            step_id = atoi(argv[2]);
186            if (slurm_checkpoint_disable(job_id, step_id)) {
187                 slurm_perror ("slurm_checkpoint_disable:");
188                 exit (1);
189            }
190            exit (0);
191       }
192
193

NOTE

195       These  functions  are  included  in the libslurm library, which must be
196       linked to your process for use (e.g. "cc myprog.c -lslurm").
197
198

COPYING

200       Copyright (C) 2004-2007 The Regents of the  University  of  California.
201       Copyright (C) 2008-2009 Lawrence Livermore National Security.  Produced
202       at   Lawrence   Livermore   National   Laboratory   (cf,   DISCLAIMER).
203       CODE-OCEC-09-009. All rights reserved.
204
205       This  file  is  part  of  Slurm,  a  resource  management program.  For
206       details, see <https://slurm.schedmd.com/>.
207
208       Slurm is free software; you can redistribute it and/or modify it  under
209       the  terms  of  the GNU General Public License as published by the Free
210       Software Foundation; either version 2  of  the  License,  or  (at  your
211       option) any later version.
212
213       Slurm  is  distributed  in the hope that it will be useful, but WITHOUT
214       ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY  or
215       FITNESS  FOR  A PARTICULAR PURPOSE.  See the GNU General Public License
216       for more details.
217
218

SEE ALSO

220       srun(1), squeue(1), free(3), slurm.conf(5)
221
222
223
224April 2015                Slurm checkpoint functions              Slurm API(3)
Impressum