root/dmtcp_restart.cpp
/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes following definitions.
- fd
- upid
- sid
- isRootOfProcessTree
- procname
- compGroup
- numPeers
- noCoordinator
- restoreGroup
- createDependentChildProcess
- createDependentNonChildProcess
- createProcess
- runMtcpRestart
- readCkptHeader
- first_char
- open_ckpt_to_read
- openCkptFileToRead
- setEnvironFd
- setNewCkptDir
- main
1 /****************************************************************************
2 * Copyright (C) 2006-2013 by Jason Ansel, Kapil Arya, and Gene Cooperman *
3 * jansel@csail.mit.edu, kapil@ccs.neu.edu, gene@ccs.neu.edu *
4 * *
5 * This file is part of DMTCP. *
6 * *
7 * DMTCP is free software: you can redistribute it and/or *
8 * modify it under the terms of the GNU Lesser General Public License as *
9 * published by the Free Software Foundation, either version 3 of the *
10 * License, or (at your option) any later version. *
11 * *
12 * DMTCP is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU Lesser General Public License for more details. *
16 * *
17 * You should have received a copy of the GNU Lesser General Public *
18 * License along with DMTCP:dmtcp/src. If not, see *
19 * <http://www.gnu.org/licenses/>. *
20 ****************************************************************************/
21
22 #include <stdio.h>
23 #include <sys/stat.h>
24 #include <sys/fcntl.h>
25 #include <sys/wait.h>
26 #include <sys/mman.h>
27 #include <limits.h>
28 #include <elf.h>
29 #include "config.h"
30 #ifdef HAS_PR_SET_PTRACER
31 # include <sys/prctl.h>
32 #endif
33
34 #include "constants.h"
35 #include "coordinatorapi.h"
36 #include "util.h"
37 #include "uniquepid.h"
38 #include "processinfo.h"
39 #include "shareddata.h"
40 #include "../jalib/jassert.h"
41 #include "../jalib/jfilesystem.h"
42
43 #define BINARY_NAME "dmtcp_restart"
44 #define MTCP_RESTART_BINARY "mtcp_restart"
45
46 using namespace dmtcp;
47
48 // Copied from mtcp/mtcp_restart.c.
49 #define DMTCP_MAGIC_FIRST 'D'
50 #define GZIP_FIRST 037
51 #ifdef HBICT_DELTACOMP
52 #define HBICT_FIRST 'H'
53 #endif
54
55 static void setEnvironFd();
56
57 string tmpDir = "/DMTCP/Uninitialized/Tmp/Dir";
58
59 // gcc-4.3.4 -Wformat=2 issues false positives for warnings unless the format
60 // string has at least one format specifier with corresponding format argument.
61 // Ubuntu 9.01 uses -Wformat=2 by default.
62 static const char* theUsage =
63 "Usage: dmtcp_restart [OPTIONS] <ckpt1.dmtcp> [ckpt2.dmtcp...]\n\n"
64 "Restart processes from a checkpoint image.\n\n"
65 "Connecting to the DMTCP Coordinator:\n"
66 " -h, --coord-host HOSTNAME (environment variable DMTCP_COORD_HOST)\n"
67 " Hostname where dmtcp_coordinator is run (default: localhost)\n"
68 " -p, --coord-port PORT_NUM (environment variable DMTCP_COORD_PORT)\n"
69 " Port where dmtcp_coordinator is run (default: 7779)\n"
70 " --port-file FILENAME\n"
71 " File to write listener port number.\n"
72 " (Useful with '--port 0', which is used to assign a random port)\n"
73 " -j, --join\n"
74 " Join an existing coordinator, raise error if one doesn't\n"
75 " already exist\n"
76 " --new-coordinator\n"
77 " Create a new coordinator at the given port. Fail if one\n"
78 " already exists on the given port. The port can be specified\n"
79 " with --coord-port, or with environment variable DMTCP_COORD_PORT.\n"
80 " If no port is specified, start coordinator at a random port\n"
81 " (same as specifying port '0').\n"
82 " -i, --interval SECONDS (environment variable DMTCP_CHECKPOINT_INTERVAL)\n"
83 " Time in seconds between automatic checkpoints.\n"
84 " 0 implies never (manual ckpt only); if not set and no env var,\n"
85 " use default value set in dmtcp_coordinator or dmtcp_command.\n"
86 " Not allowed if --join is specified\n"
87 "\n"
88 "Other options:\n"
89 " --run-as-root\n"
90 " Allow root to run dmtcp_restart and disable uid checking.\n"
91 " (default: disabled)\n"
92 " --no-strict-uid-checking\n"
93 " Disable uid checking for the checkpoint image. This allows\n"
94 " the checkpoint image to be restarted by a different user\n"
95 " than the one that created it.\n"
96 " (environment variable DMTCP_DISABLE_UID_CHECKING)\n"
97 " --ckptdir (environment variable DMTCP_CHECKPOINT_DIR):\n"
98 " Directory to store checkpoint images\n"
99 " (default: use the same directory used in previous checkpoint)\n"
100 " --tmpdir PATH (environment variable DMTCP_TMPDIR)\n"
101 " Directory to store temporary files (default: $TMDPIR or /tmp)\n"
102 " -q, --quiet (or set environment variable DMTCP_QUIET = 0, 1, or 2)\n"
103 " Skip NOTE messages; if given twice, also skip WARNINGs\n"
104 " --help\n"
105 " Print this message and exit.\n"
106 " --version\n"
107 " Print version information and exit.\n"
108 "\n"
109 HELP_AND_CONTACT_INFO
110 "\n"
111 ;
112
113 class RestoreTarget;
114
115 typedef map<UniquePid, RestoreTarget*> RestoreTargetMap;
116 RestoreTargetMap targets;
117 RestoreTargetMap independentProcessTreeRoots;
118 bool noStrictUIDChecking = false;
119 bool runAsRoot = false;
120 static string thePortFile;
121 CoordinatorMode allowedModes = COORD_ANY;
122
123 static void setEnvironFd();
124 static void runMtcpRestart(int is32bitElf, int fd, ProcessInfo *pInfo);
125 static int readCkptHeader(const string& path, ProcessInfo *pInfo);
126 static int openCkptFileToRead(const string& path);
127
128 class RestoreTarget
129 {
130 public:
131 RestoreTarget(const string& path)
132 : _path(path)
133 {
134 JASSERT(jalib::Filesystem::FileExists(_path)) (_path)
135 .Text ( "checkpoint file missing" );
136
137 _fd = readCkptHeader(_path, &_pInfo);
138 JTRACE("restore target") (_path) (_pInfo.numPeers()) (_pInfo.compGroup());
139 }
140
141 int fd() const { return _fd; }
142 const UniquePid& upid() const { return _pInfo.upid(); }
143 pid_t pid() const { return _pInfo.pid(); }
144 pid_t sid() const { return _pInfo.sid(); }
145 bool isRootOfProcessTree() const {
146 return _pInfo.isRootOfProcessTree();
147 }
148 string procname() { return _pInfo.procname(); }
149 UniquePid compGroup() { return _pInfo.compGroup(); }
150 int numPeers() { return _pInfo.numPeers(); }
151 bool noCoordinator() { return _pInfo.noCoordinator(); }
152
153 void restoreGroup()
154 {
155 if (_pInfo.isGroupLeader()) {
156 // create new Group where this process becomes a leader
157 JTRACE("Create new Group.");
158 setpgid(0, 0);
159 }
160 }
161
162 void createDependentChildProcess()
163 {
164 pid_t pid = fork();
165 JASSERT(pid != -1);
166 if (pid != 0) {
167 return;
168 }
169 createProcess();
170 }
171
172 void createDependentNonChildProcess()
173 {
174 pid_t pid = fork();
175 JASSERT(pid != -1);
176 if (pid == 0) {
177 pid_t gchild = fork();
178 JASSERT(gchild != -1);
179 if (gchild != 0) {
180 exit(0);
181 }
182 createProcess();
183 } else {
184 JASSERT(waitpid(pid, NULL, 0) == pid);
185 }
186 }
187
188 void createProcess(bool createIndependentRootProcesses = false)
189 {
190 UniquePid::ThisProcess() = _pInfo.upid();
191 UniquePid::ParentProcess() = _pInfo.uppid();
192 Util::initializeLogFile(_pInfo.procname());
193
194 if (createIndependentRootProcesses) {
195 DmtcpUniqueProcessId compId = _pInfo.compGroup().upid();
196 CoordinatorInfo coordInfo;
197 struct in_addr localIPAddr;
198 if (_pInfo.noCoordinator()) {
199 allowedModes = COORD_NONE;
200 }
201
202 // dmtcp_restart sets ENV_VAR_NAME_HOST/PORT, even if cmd line flag used
203 const char *host = NULL;
204 int port = UNINITIALIZED_PORT;
205 Util::getCoordHostAndPort(allowedModes, &host, &port);
206 // FIXME: We will use the new HOST and PORT here, but after restart,,
207 // we will use the old HOST and PORT from the ckpt image.
208 CoordinatorAPI::instance().connectToCoordOnRestart(allowedModes,
209 _pInfo.procname(),
210 _pInfo.compGroup(),
211 _pInfo.numPeers(),
212 &coordInfo,
213 host,
214 port,
215 &localIPAddr);
216 // If port was 0, we'll get new random port when coordinator starts up.
217 Util::getCoordHostAndPort(allowedModes, &host, &port);
218 Util::writeCoordPortToFile(port, thePortFile.c_str());
219
220 string installDir =
221 jalib::Filesystem::DirName(jalib::Filesystem::GetProgramDir());
222
223 #if defined(__i386__) || defined(__arm__)
224 if (Util::strEndsWith(installDir, "/lib/dmtcp/32")) {
225 // If dmtcp_launch was compiled for 32 bits in 64-bit O/S, then note:
226 // DMTCP_ROOT/bin/dmtcp_launch is a symbolic link to:
227 // DMTCP_ROOT/bin/dmtcp_launch/lib/dmtcp/32/bin
228 // GetProgramDir() followed the link. So, need to remove the suffix.
229 char *str = const_cast<char*>(installDir.c_str());
230 str[strlen(str) - strlen("/lib/dmtcp/32")] = '\0';
231 installDir = str;
232 }
233 #endif
234
235 /* We need to initialize SharedData here to make sure that it is
236 * initialized with the correct coordinator timestamp. The coordinator
237 * timestamp is updated only during postCkpt callback. However, the
238 * SharedData area may be initialized earlier (for example, while
239 * recreating threads), causing it to use *older* timestamp.
240 */
241 SharedData::initialize(tmpDir.c_str(),
242 installDir.c_str(),
243 &compId,
244 &coordInfo,
245 &localIPAddr);
246
247 Util::prepareDlsymWrapper();
248 }
249
250 JTRACE("Creating process during restart") (upid()) (_pInfo.procname());
251
252 RestoreTargetMap::iterator it;
253 for (it = targets.begin(); it != targets.end(); it++) {
254 RestoreTarget *t = it->second;
255 if (_pInfo.upid() == t->_pInfo.upid()) {
256 continue;
257 } else if (_pInfo.isChild(t->upid()) &&
258 t->_pInfo.sid() != _pInfo.pid()) {
259 t->createDependentChildProcess();
260 }
261 }
262
263 if (createIndependentRootProcesses) {
264 RestoreTargetMap::iterator it;
265 for (it = independentProcessTreeRoots.begin();
266 it != independentProcessTreeRoots.end();
267 it++) {
268 RestoreTarget *t = it->second;
269 if (t != this) {
270 t->createDependentNonChildProcess();
271 }
272 }
273 }
274
275 // If we were the session leader, become one now.
276 if (_pInfo.sid() == _pInfo.pid()) {
277 if (getsid(0) != _pInfo.pid()) {
278 JWARNING(setsid() != -1) (getsid(0)) (JASSERT_ERRNO)
279 .Text("Failed to restore this process as session leader.");
280 }
281 }
282
283 // Now recreate processes with sid == _pid
284 for (it = targets.begin(); it != targets.end(); it++) {
285 RestoreTarget *t = it->second;
286 if (_pInfo.upid() == t->_pInfo.upid()) {
287 continue;
288 } else if (t->_pInfo.sid() == _pInfo.pid()) {
289 if (_pInfo.isChild(t->upid())) {
290 t->createDependentChildProcess();
291 } else if (t->isRootOfProcessTree()) {
292 t->createDependentNonChildProcess();
293 }
294 }
295 }
296
297 // Now close all open fds except _fd;
298 for (it = targets.begin(); it != targets.end(); it++) {
299 RestoreTarget *t = it->second;
300 if (t != this) {
301 close(t->fd());
302 }
303 }
304
305 string ckptDir = jalib::Filesystem::GetDeviceName(PROTECTED_CKPT_DIR_FD);
306 if (ckptDir.length() == 0) {
307 // Create the ckpt-dir fd so that the restarted process can know about
308 // the abs-path of ckpt-image.
309 string dirName = jalib::Filesystem::DirName(_path);
310 int dirfd = open(dirName.c_str(), O_RDONLY);
311 JASSERT(dirfd != -1) (JASSERT_ERRNO);
312 if (dirfd != PROTECTED_CKPT_DIR_FD) {
313 JASSERT(dup2(dirfd, PROTECTED_CKPT_DIR_FD) == PROTECTED_CKPT_DIR_FD);
314 close(dirfd);
315 }
316 }
317
318 if (!createIndependentRootProcesses) {
319 // dmtcp_restart sets ENV_VAR_NAME_HOST/PORT, even if cmd line flag used
320 const char *host = NULL;
321 int port = UNINITIALIZED_PORT;
322 int *port_p = &port;
323 Util::getCoordHostAndPort(allowedModes, &host, port_p);
324 CoordinatorAPI::instance().connectToCoordOnRestart(allowedModes,
325 _pInfo.procname(),
326 _pInfo.compGroup(),
327 _pInfo.numPeers(),
328 NULL,
329 host,
330 port,
331 NULL);
332 }
333
334 setEnvironFd();
335 int is32bitElf = 0;
336
337 #if defined(__x86_64__) || defined(__aarch64__)
338 is32bitElf = (_pInfo.elfType() == ProcessInfo::Elf_32);
339 #elif defined(__i386__) || defined(__arm__)
340 is32bitElf = true;
341 #endif
342 runMtcpRestart(is32bitElf, _fd, &_pInfo);
343
344 JASSERT ( false ).Text ( "unreachable" );
345 }
346
347 private:
348 string _path;
349 ProcessInfo _pInfo;
350 int _fd;
351 };
352
353 static void runMtcpRestart(int is32bitElf, int fd, ProcessInfo *pInfo)
354 {
355 char fdBuf[8];
356 char stderrFdBuf[8];
357 sprintf(fdBuf, "%d", fd);
358 sprintf(stderrFdBuf, "%d", PROTECTED_STDERR_FD);
359
360 #ifdef HAS_PR_SET_PTRACER
361 if (getenv("DMTCP_GDB_ATTACH_ON_RESTART")) {
362 JNOTE("\n *******************************************************\n"
363 " *** Environment variable, DMTCP_GDB_ATTACH_ON_RESTART is set\n"
364 " *** You can attach to the running process as follows:\n"
365 " *** gdb _PROGRAM_NAME_ PID [See below for PID.]\n"
366 " *** NOTE: This mode can be a security risk.\n"
367 " *** Do not set the env. variable normally.\n"
368 " *******************************************************")
369 (getpid());
370 prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); // Allow 'gdb attach'
371 }
372 #endif
373
374 static string mtcprestart = Util::getPath ("mtcp_restart");
375
376 #if defined(CONFIG_M32)
377 if (is32bitElf) {
378 mtcprestart = Util::getPath("mtcp_restart-32", is32bitElf);
379 }
380 #endif
381
382 char* const newArgs[] = {
383 (char*) mtcprestart.c_str(),
384 const_cast<char*> ("--fd"), fdBuf,
385 const_cast<char*> ("--stderr-fd"), stderrFdBuf,
386 NULL
387 };
388
389 execve (newArgs[0], newArgs, environ);
390 JASSERT (false) (newArgs[0]) (newArgs[1]) (JASSERT_ERRNO)
391 .Text ("exec() failed");
392 }
393
394 // ************************ For reading checkpoint files *****************
395
396 int readCkptHeader(const string& path, ProcessInfo *pInfo)
397 {
398 int fd = openCkptFileToRead(path);
399 const size_t len = strlen(DMTCP_FILE_HEADER);
400
401 jalib::JBinarySerializeReaderRaw rdr("", fd);
402 pInfo->serialize(rdr);
403 size_t numRead = len + rdr.bytes();
404
405 // We must read in multiple of PAGE_SIZE
406 const ssize_t pagesize = Util::pageSize();
407 ssize_t remaining = pagesize - (numRead % pagesize);
408 char buf[remaining];
409 JASSERT(Util::readAll(fd, buf, remaining) == remaining);
410 return fd;
411 }
412
413 static char first_char(const char *filename)
414 {
415 int fd, rc;
416 char c;
417
418 fd = open(filename, O_RDONLY);
419 JASSERT(fd >= 0) (filename) .Text("ERROR: Cannot open filename");
420
421 rc = read(fd, &c, 1);
422 JASSERT(rc == 1) (filename) .Text("ERROR: Error reading from filename");
423
424 close(fd);
425 return c;
426 }
427
428 // Copied from mtcp/mtcp_restart.c.
429 // Let's keep this code close to MTCP code to avoid maintenance problems.
430 // MTCP code in: mtcp/mtcp_restart.c:open_ckpt_to_read()
431 // A previous version tried to replace this with popen, causing a regression:
432 // (no call to pclose, and possibility of using a wrong fd).
433 // Returns fd;
434 static int open_ckpt_to_read(const char *filename)
435 {
436 int fd;
437 int fds[2];
438 char fc;
439 const char *decomp_path;
440 const char **decomp_args;
441 const char *gzip_path = "gzip";
442 static const char * gzip_args[] = {
443 const_cast<char*> ("gzip"),
444 const_cast<char*> ("-d"),
445 const_cast<char*> ("-"),
446 NULL
447 };
448 #ifdef HBICT_DELTACOMP
449 const char *hbict_path = const_cast<char*> ("hbict");
450 static const char *hbict_args[] = {
451 const_cast<char*> ("hbict"),
452 const_cast<char*> ("-r"),
453 NULL
454 };
455 #endif
456 pid_t cpid;
457
458 fc = first_char(filename);
459 fd = open(filename, O_RDONLY);
460 JASSERT(fd>=0)(filename).Text("Failed to open file.");
461
462 if (fc == DMTCP_MAGIC_FIRST) { /* no compression */
463 return fd;
464 }
465 else if (fc == GZIP_FIRST
466 #ifdef HBICT_DELTACOMP
467 || fc == HBICT_FIRST
468 #endif
469 ) {
470 if (fc == GZIP_FIRST) {
471 decomp_path = gzip_path;
472 decomp_args = gzip_args;
473 }
474 #ifdef HBICT_DELTACOMP
475 else {
476 decomp_path = hbict_path;
477 decomp_args = hbict_args;
478 }
479 #endif
480
481 JASSERT(pipe(fds) != -1) (filename)
482 .Text("Cannot create pipe to execute gunzip to decompress ckpt file!");
483
484 cpid = fork();
485
486 JASSERT(cpid != -1)
487 .Text("ERROR: Cannot fork to execute gunzip to decompress ckpt file!");
488 if (cpid > 0) { /* parent process */
489 JTRACE("created child process to uncompress checkpoint file") (cpid);
490 close(fd);
491 close(fds[1]);
492 // Wait for child process
493 JASSERT(waitpid(cpid, NULL, 0) == cpid);
494 return fds[0];
495 } else { /* child process */
496 /* Fork a grandchild process and kill the parent. This way the grandchild
497 * process never becomes a zombie.
498 *
499 * Sometimes dmtcp_restart is called with multiple ckpt images. In that
500 * situation, the dmtcp_restart process creates gzip processes and only
501 * later forks mtcp_restart processes. The gzip processes can not be
502 * wait()'d upon by the corresponding mtcp_restart processes because
503 * their parent is the original dmtcp_restart process and thus they
504 * become zombie.
505 */
506 cpid = fork();
507 JASSERT(cpid != -1);
508 if (cpid > 0) {
509 // Use _exit() instead of exit() to avoid popping atexit() handlers
510 // registered by the parent process.
511 _exit(0);
512 }
513
514 // Grandchild process
515 JTRACE ( "child process, will exec into external de-compressor");
516 fd = dup(dup(dup(fd)));
517 fds[1] = dup(fds[1]);
518 close(fds[0]);
519 JASSERT(fd != -1);
520 JASSERT(dup2(fd, STDIN_FILENO) == STDIN_FILENO);
521 close(fd);
522 JASSERT(dup2(fds[1], STDOUT_FILENO) == STDOUT_FILENO);
523 close(fds[1]);
524 execvp(decomp_path, (char **)decomp_args);
525 JASSERT(decomp_path!=NULL) (decomp_path)
526 .Text("Failed to launch gzip.");
527 /* should not get here */
528 JASSERT(false)
529 .Text("Decompression failed! No restoration will be performed!");
530 }
531 } else { /* invalid magic number */
532 JASSERT(false)
533 .Text("ERROR: Invalid magic number in this checkpoint file!");
534 }
535 return -1;
536 }
537
538 // See comments above for open_ckpt_to_read()
539 int openCkptFileToRead(const string& path)
540 {
541 char buf[1024];
542 int fd = open_ckpt_to_read(path.c_str());
543 // The rest of this function is for compatibility with original definition.
544 JASSERT(fd >= 0) (path) .Text("Failed to open file.");
545 const int len = strlen(DMTCP_FILE_HEADER);
546 JASSERT(read(fd, buf, len) == len)(path) .Text("read() failed");
547 if (strncmp(buf, DMTCP_FILE_HEADER, len) == 0) {
548 JTRACE("opened checkpoint file [uncompressed]")(path);
549 } else {
550 close(fd);
551 fd = open_ckpt_to_read(path.c_str()); /* Re-open from beginning */
552 JASSERT(fd >= 0) (path) .Text("Failed to open file.");
553 }
554 return fd;
555 }
556 // ************************ End of for reading checkpoint files *************
557
558
559 static void setEnvironFd()
560 {
561 char envFile[PATH_MAX];
562 sprintf(envFile, "%s/envFile.XXXXXX", tmpDir.c_str());
563 int fd = mkstemp(envFile);
564 JASSERT(fd != -1) (envFile) (JASSERT_ERRNO);
565 JASSERT(unlink(envFile) == 0) (envFile) (JASSERT_ERRNO);
566 JASSERT(dup2(fd, PROTECTED_ENVIRON_FD) == PROTECTED_ENVIRON_FD)
567 (JASSERT_ERRNO);
568 JASSERT(close(fd) == 0);
569 fd = PROTECTED_ENVIRON_FD;
570
571 char **env = environ;
572 while (*env != NULL) {
573 Util::writeAll(fd, *env, strlen(*env) + 1); // Also write null character
574 env++;
575 }
576 Util::writeAll(fd, *env, 1); // Write final null character
577 }
578
579 static void setNewCkptDir(char *path)
580 {
581 struct stat st;
582 if (stat(path, &st) == -1) {
583 JASSERT(mkdir(path, S_IRWXU) == 0 || errno == EEXIST)
584 (JASSERT_ERRNO) (path)
585 .Text("Error creating checkpoint directory");
586 JASSERT(0 == access(path, X_OK|W_OK)) (path)
587 .Text("ERROR: Missing execute- or write-access to checkpoint dir");
588 } else {
589 JASSERT(S_ISDIR(st.st_mode)) (path) .Text("ckptdir not a directory");
590 }
591
592 int fd = open(path, O_RDONLY);
593 JASSERT(fd != -1) (path);
594 JASSERT(dup2(fd, PROTECTED_CKPT_DIR_FD) == PROTECTED_CKPT_DIR_FD)
595 (fd) (path);
596 if (fd != PROTECTED_CKPT_DIR_FD) {
597 close(fd);
598 }
599 }
600
601 //shift args
602 #define shift argc--,argv++
603
604 int main(int argc, char** argv)
605 {
606 char *tmpdir_arg = NULL;
607
608 initializeJalib();
609
610 if (!getenv(ENV_VAR_QUIET)) {
611 setenv(ENV_VAR_QUIET, "0", 0);
612 }
613
614 if (getenv(ENV_VAR_DISABLE_UID_CHECKING)) {
615 noStrictUIDChecking = true;
616 }
617
618 if (argc == 1) {
619 printf("%s", DMTCP_VERSION_AND_COPYRIGHT_INFO);
620 printf("(For help: %s --help)\n\n", argv[0]);
621 return DMTCP_FAIL_RC;
622 }
623
624 //process args
625 shift;
626 while (true) {
627 string s = argc>0 ? argv[0] : "--help";
628 if (s == "--help" && argc == 1) {
629 printf("%s", theUsage);
630 return DMTCP_FAIL_RC;
631 } else if ((s == "--version") && argc == 1) {
632 printf("%s", DMTCP_VERSION_AND_COPYRIGHT_INFO);
633 return DMTCP_FAIL_RC;
634 } else if (s == "-j" || s == "--join") {
635 allowedModes = COORD_JOIN;
636 shift;
637 } else if (s == "--new-coordinator") {
638 allowedModes = COORD_NEW;
639 shift;
640 } else if (s == "--run-as-root") {
641 runAsRoot = true;
642 shift;
643 } else if (s == "--no-strict-uid-checking") {
644 noStrictUIDChecking = true;
645 shift;
646 } else if (s == "-i" || s == "--interval") {
647 setenv(ENV_VAR_CKPT_INTR, argv[1], 1);
648 shift; shift;
649 } else if (argv[0][0] == '-' && argv[0][1] == 'i' &&
650 isdigit(argv[0][2])) { // else if -i5, for example
651 setenv(ENV_VAR_CKPT_INTR, argv[0]+2, 1);
652 shift;
653 } else if (argc > 1 && (s == "-h" || s == "--coord-host" || s == "--host")){
654 setenv(ENV_VAR_NAME_HOST, argv[1], 1);
655 shift; shift;
656 } else if (argc>1 && (s == "-p" || s == "--coord-port" || s == "--port")) {
657 setenv(ENV_VAR_NAME_PORT, argv[1], 1);
658 shift; shift;
659 } else if (argv[0][0] == '-' && argv[0][1] == 'p' &&
660 isdigit(argv[0][2])) { // else if -p0, for example
661 setenv(ENV_VAR_NAME_PORT, argv[0]+2, 1);
662 shift;
663 } else if (argc>1 && s == "--port-file"){
664 thePortFile = argv[1];
665 shift; shift;
666 } else if (argc > 1 && (s == "-c" || s == "--ckptdir")) {
667 setNewCkptDir(argv[1]);
668 shift; shift;
669 } else if (argc > 1 && (s == "-t" || s == "--tmpdir")) {
670 tmpdir_arg = argv[1];
671 shift; shift;
672 } else if (s == "-q" || s == "--quiet") {
673 *getenv(ENV_VAR_QUIET) = *getenv(ENV_VAR_QUIET) + 1;
674 // Just in case a non-standard version of setenv is being used:
675 setenv(ENV_VAR_QUIET, getenv(ENV_VAR_QUIET), 1);
676 shift;
677 } else if ((s.length() > 2 && s.substr(0, 2) == "--") ||
678 (s.length() > 1 && s.substr(0, 1) == "-")) {
679 printf("Invalid Argument\n%s", theUsage);
680 return DMTCP_FAIL_RC;
681 } else if (argc > 1 && s == "--") {
682 shift;
683 break;
684 } else {
685 break;
686 }
687 }
688
689 tmpDir = Util::calcTmpDir(tmpdir_arg);
690
691 jassert_quiet = *getenv(ENV_VAR_QUIET) - '0';
692
693 //make sure JASSERT initializes now, rather than during restart
694 Util::initializeLogFile(tmpDir);
695
696 if (!runAsRoot && (getuid() == 0 || geteuid() == 0)) {
697 JASSERT_STDERR <<
698 "Running dmtcp_restart as root is dangerous. Aborting.\n"
699 "If you still want to do this (at your own risk), then use\n" \
700 " dmtcp_restart --run-as-root\n";
701 exit(0);
702 }
703
704 JTRACE("New dmtcp_restart process; _argc_ ckpt images") (argc);
705
706 bool doAbort = false;
707 for (; argc > 0; shift) {
708 string restorename(argv[0]);
709 struct stat buf;
710 int rc = stat(restorename.c_str(), &buf);
711 if (Util::strEndsWith(restorename, "_files")) {
712 continue;
713 } else if (!Util::strEndsWith(restorename, ".dmtcp")) {
714 JNOTE("File doesn't have .dmtcp extension. Check Usage.")
715 (restorename);
716 JASSERT_STDERR << theUsage;
717 doAbort = true;
718 } else if (rc == -1) {
719 char error_msg[1024];
720 sprintf(error_msg, "\ndmtcp_restart: ckpt image %s", restorename.c_str());
721 perror(error_msg);
722 doAbort = true;
723 } else if (buf.st_uid != getuid() && !noStrictUIDChecking && !runAsRoot) {
724 /*Could also run if geteuid() matches*/
725 printf("\nProcess uid (%d) doesn't match uid (%d) of\n" \
726 "checkpoint image (%s).\n" \
727 "This is dangerous. Aborting for security reasons.\n" \
728 "If you still want to do this (at your own risk),\n" \
729 " then modify dmtcp/src/%s:%d and re-compile.\n",
730 getuid(), buf.st_uid, restorename.c_str(), __FILE__, __LINE__ - 7);
731 doAbort = true;
732 }
733 if (doAbort) {
734 exit(DMTCP_FAIL_RC);
735 }
736
737 JTRACE("Will restart ckpt image") (argv[0]);
738 RestoreTarget *t = new RestoreTarget(argv[0]);
739 targets[t->upid()] = t;
740 }
741
742 // Prepare list of independent process tree roots
743 RestoreTargetMap::iterator i;
744 for (i = targets.begin(); i != targets.end(); i++) {
745 RestoreTarget *t1 = i->second;
746 if (t1->isRootOfProcessTree()) {
747 RestoreTargetMap::iterator j;
748 for (j = targets.begin(); j != targets.end(); j++) {
749 RestoreTarget *t2 = j->second;
750 if (t1 == t2) continue;
751 if (t1->sid() == t2->pid()) {
752 break;
753 }
754 }
755 if (j == targets.end()) {
756 independentProcessTreeRoots[t1->upid()] = t1;
757 }
758 }
759 }
760 JASSERT(independentProcessTreeRoots.size() > 0)
761 .Text("There must be at least one process tree that doesn't have\n"
762 " a different process as session leader.");
763
764 WorkerState::setCurrentState(WorkerState::RESTARTING);
765
766 RestoreTarget *t = independentProcessTreeRoots.begin()->second;
767 JASSERT(t->pid() != 0);
768 JASSERT(!t->noCoordinator() || allowedModes == COORD_ANY)
769 .Text("Process had no coordinator prior to checkpoint;\n"
770 " but either --join or --new-coordinator was specified.");
771 t->createProcess(true);
772 JASSERT(false).Text("unreachable");
773 return -1;
774 }