root/dmtcp_restart.cpp

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. fd
  2. upid
  3. sid
  4. isRootOfProcessTree
  5. procname
  6. compGroup
  7. numPeers
  8. noCoordinator
  9. restoreGroup
  10. createDependentChildProcess
  11. createDependentNonChildProcess
  12. createProcess
  13. runMtcpRestart
  14. readCkptHeader
  15. first_char
  16. open_ckpt_to_read
  17. openCkptFileToRead
  18. setEnvironFd
  19. setNewCkptDir
  20. main

   1 /****************************************************************************
   2  *   Copyright (C) 2006-2013 by Jason Ansel, Kapil Arya, and Gene Cooperman *
   3  *   jansel@csail.mit.edu, kapil@ccs.neu.edu, gene@ccs.neu.edu              *
   4  *                                                                          *
   5  *  This file is part of DMTCP.                                             *
   6  *                                                                          *
   7  *  DMTCP is free software: you can redistribute it and/or                  *
   8  *  modify it under the terms of the GNU Lesser General Public License as   *
   9  *  published by the Free Software Foundation, either version 3 of the      *
  10  *  License, or (at your option) any later version.                         *
  11  *                                                                          *
  12  *  DMTCP is distributed in the hope that it will be useful,                *
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of          *
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           *
  15  *  GNU Lesser General Public License for more details.                     *
  16  *                                                                          *
  17  *  You should have received a copy of the GNU Lesser General Public        *
  18  *  License along with DMTCP:dmtcp/src.  If not, see                        *
  19  *  <http://www.gnu.org/licenses/>.                                         *
  20  ****************************************************************************/
  21 
  22 #include <stdio.h>
  23 #include <sys/stat.h>
  24 #include <sys/fcntl.h>
  25 #include <sys/wait.h>
  26 #include <sys/mman.h>
  27 #include <limits.h>
  28 #include <elf.h>
  29 #include "config.h"
  30 #ifdef HAS_PR_SET_PTRACER
  31 # include <sys/prctl.h>
  32 #endif
  33 
  34 #include "constants.h"
  35 #include "coordinatorapi.h"
  36 #include "util.h"
  37 #include "uniquepid.h"
  38 #include "processinfo.h"
  39 #include "shareddata.h"
  40 #include  "../jalib/jassert.h"
  41 #include  "../jalib/jfilesystem.h"
  42 
  43 #define BINARY_NAME "dmtcp_restart"  // not referenced in the visible part of this file
  44 #define MTCP_RESTART_BINARY "mtcp_restart"  // NOTE(review): runMtcpRestart() spells this name literally
  45 
  46 using namespace dmtcp;
  47 
     // Possible first bytes of a checkpoint image.  open_ckpt_to_read() compares
     // the first byte of the file against these to decide whether to read the
     // image directly or to pipe it through an external decompressor.
  48 // Copied from mtcp/mtcp_restart.c.
  49 #define DMTCP_MAGIC_FIRST 'D'  // image is read directly (no compression)
  50 #define GZIP_FIRST 037  // image is piped through "gzip -d -"
  51 #ifdef HBICT_DELTACOMP
  52 #define HBICT_FIRST 'H'  // image is piped through "hbict -r"
  53 #endif
  54 
     // Forward declaration; the definition is near the end of this file.
  55 static void setEnvironFd();
  56 
     // Placeholder value; main() overwrites it with Util::calcTmpDir() before
     // any restore target is created.
  57 string tmpDir = "/DMTCP/Uninitialized/Tmp/Dir";
  58 
  59 // gcc-4.3.4 -Wformat=2 issues false positives for warnings unless the format
  60 // string has at least one format specifier with corresponding format argument.
  61 // Ubuntu 9.01 uses -Wformat=2 by default.
  62 static const char* theUsage =
  63   "Usage: dmtcp_restart [OPTIONS] <ckpt1.dmtcp> [ckpt2.dmtcp...]\n\n"
  64   "Restart processes from a checkpoint image.\n\n"
  65   "Connecting to the DMTCP Coordinator:\n"
  66   "  -h, --coord-host HOSTNAME (environment variable DMTCP_COORD_HOST)\n"
  67   "              Hostname where dmtcp_coordinator is run (default: localhost)\n"
  68   "  -p, --coord-port PORT_NUM (environment variable DMTCP_COORD_PORT)\n"
  69   "              Port where dmtcp_coordinator is run (default: 7779)\n"
  70   "  --port-file FILENAME\n"
  71   "              File to write listener port number.\n"
  72   "              (Useful with '--port 0', which is used to assign a random port)\n"
  73   "  -j, --join\n"
  74   "              Join an existing coordinator, raise error if one doesn't\n"
  75   "              already exist\n"
  76   "  --new-coordinator\n"
  77   "              Create a new coordinator at the given port. Fail if one\n"
  78   "              already exists on the given port. The port can be specified\n"
  79   "              with --coord-port, or with environment variable DMTCP_COORD_PORT.\n"
  80   "              If no port is specified, start coordinator at a random port\n"
  81   "              (same as specifying port '0').\n"
  82   "  -i, --interval SECONDS (environment variable DMTCP_CHECKPOINT_INTERVAL)\n"
  83   "              Time in seconds between automatic checkpoints.\n"
  84   "              0 implies never (manual ckpt only); if not set and no env var,\n"
  85   "              use default value set in dmtcp_coordinator or dmtcp_command.\n"
  86   "              Not allowed if --join is specified\n"
  87   "\n"
  88   "Other options:\n"
  89   "  --run-as-root\n"
  90   "              Allow root to run dmtcp_restart and disable uid checking.\n"
  91   "              (default: disabled)\n"
  92   "  --no-strict-uid-checking\n"
  93   "              Disable uid checking for the checkpoint image. This allows\n"
  94   "              the checkpoint image to be restarted by a different user\n"
  95   "              than the one that created it.\n"
  96   "              (environment variable DMTCP_DISABLE_UID_CHECKING)\n"
  97   "  --ckptdir (environment variable DMTCP_CHECKPOINT_DIR):\n"
  98   "              Directory to store checkpoint images\n"
  99   "              (default: use the same directory used in previous checkpoint)\n"
 100   "  --tmpdir PATH (environment variable DMTCP_TMPDIR)\n"
 101   "              Directory to store temporary files (default: $TMDPIR or /tmp)\n"
 102   "  -q, --quiet (or set environment variable DMTCP_QUIET = 0, 1, or 2)\n"
 103   "              Skip NOTE messages; if given twice, also skip WARNINGs\n"
 104   "  --help\n"
 105   "              Print this message and exit.\n"
 106   "  --version\n"
 107   "              Print version information and exit.\n"
 108   "\n"
 109   HELP_AND_CONTACT_INFO
 110   "\n"
 111 ;
 112 
 113 class RestoreTarget;  // defined below; declared early for the map typedef
 114 
     // Restore targets keyed by the UniquePid returned by RestoreTarget::upid().
 115 typedef map<UniquePid, RestoreTarget*> RestoreTargetMap;
     // One entry per checkpoint image accepted on the command line; filled in
     // by main() and walked by RestoreTarget::createProcess().
 116 RestoreTargetMap targets;
     // Subset of `targets`: process-tree roots whose session id does not match
     // the pid of any other target.  Computed in main().
 117 RestoreTargetMap independentProcessTreeRoots;
     // Set by --no-strict-uid-checking or when ENV_VAR_DISABLE_UID_CHECKING is
     // present in the environment; skips the image-owner uid comparison.
 118 bool noStrictUIDChecking = false;
     // Set by --run-as-root; permits uid/euid 0 and skips the uid comparison.
 119 bool runAsRoot = false;
     // Value of --port-file; handed to Util::writeCoordPortToFile().
 120 static string thePortFile;
     // COORD_JOIN for --join, COORD_NEW for --new-coordinator; createProcess()
     // switches it to COORD_NONE when the image reports noCoordinator().
 121 CoordinatorMode allowedModes = COORD_ANY;
 122 
     // Forward declarations of helpers defined later in this file.
 123 static void setEnvironFd();
 124 static void runMtcpRestart(int is32bitElf, int fd, ProcessInfo *pInfo);
 125 static int readCkptHeader(const string& path, ProcessInfo *pInfo);
 126 static int openCkptFileToRead(const string& path);
 127 
 128 class RestoreTarget
 129 {
 130   public:
 131     RestoreTarget(const string& path)
 132       : _path(path)
 133     {
 134       JASSERT(jalib::Filesystem::FileExists(_path)) (_path)
 135         .Text ( "checkpoint file missing" );
 136 
 137       _fd = readCkptHeader(_path, &_pInfo);
 138       JTRACE("restore target") (_path) (_pInfo.numPeers()) (_pInfo.compGroup());
 139     }
 140 
 141     int fd() const { return _fd; }
 142     const UniquePid& upid() const { return _pInfo.upid(); }
 143     pid_t pid() const { return _pInfo.pid(); }
 144     pid_t sid() const { return _pInfo.sid(); }
 145     bool isRootOfProcessTree() const {
 146       return _pInfo.isRootOfProcessTree();
 147     }
 148     string procname() { return _pInfo.procname(); }
 149     UniquePid compGroup() { return _pInfo.compGroup(); }
 150     int numPeers() { return _pInfo.numPeers(); }
 151     bool noCoordinator() { return _pInfo.noCoordinator(); }
 152 
 153     void restoreGroup()
 154     {
 155       if (_pInfo.isGroupLeader()) {
 156         // create new Group where this process becomes a leader
 157         JTRACE("Create new Group.");
 158         setpgid(0, 0);
 159       }
 160     }
 161 
 162     void createDependentChildProcess()
 163     {
 164       pid_t pid = fork();
 165       JASSERT(pid != -1);
 166       if (pid != 0) {
 167         return;
 168       }
 169       createProcess();
 170     }
 171 
 172     void createDependentNonChildProcess()
 173     {
 174       pid_t pid = fork();
 175       JASSERT(pid != -1);
 176       if (pid == 0) {
 177         pid_t gchild = fork();
 178         JASSERT(gchild != -1);
 179         if (gchild != 0) {
 180           exit(0);
 181         }
 182         createProcess();
 183       } else {
 184         JASSERT(waitpid(pid, NULL, 0) == pid);
 185       }
 186     }
 187 
 188     void createProcess(bool createIndependentRootProcesses = false)
 189     {
 190       UniquePid::ThisProcess() = _pInfo.upid();
 191       UniquePid::ParentProcess() = _pInfo.uppid();
 192       Util::initializeLogFile(_pInfo.procname());
 193 
 194       if (createIndependentRootProcesses) {
 195         DmtcpUniqueProcessId compId = _pInfo.compGroup().upid();
 196         CoordinatorInfo coordInfo;
 197         struct in_addr localIPAddr;
 198         if (_pInfo.noCoordinator()) {
 199           allowedModes = COORD_NONE;
 200         }
 201 
 202         // dmtcp_restart sets ENV_VAR_NAME_HOST/PORT, even if cmd line flag used
 203         const char *host = NULL;
 204         int port = UNINITIALIZED_PORT;
 205         Util::getCoordHostAndPort(allowedModes, &host, &port);
 206         // FIXME:  We will use the new HOST and PORT here, but after restart,,
 207         //           we will use the old HOST and PORT from the ckpt image.
 208         CoordinatorAPI::instance().connectToCoordOnRestart(allowedModes,
 209                                                            _pInfo.procname(),
 210                                                            _pInfo.compGroup(),
 211                                                            _pInfo.numPeers(),
 212                                                            &coordInfo,
 213                                                            host,
 214                                                            port,
 215                                                            &localIPAddr);
 216         // If port was 0, we'll get new random port when coordinator starts up.
 217         Util::getCoordHostAndPort(allowedModes, &host, &port);
 218         Util::writeCoordPortToFile(port, thePortFile.c_str());
 219 
 220         string installDir =
 221           jalib::Filesystem::DirName(jalib::Filesystem::GetProgramDir());
 222 
 223 #if defined(__i386__) || defined(__arm__)
 224         if (Util::strEndsWith(installDir, "/lib/dmtcp/32")) {
 225           // If dmtcp_launch was compiled for 32 bits in 64-bit O/S, then note:
 226           // DMTCP_ROOT/bin/dmtcp_launch is a symbolic link to:
 227           //    DMTCP_ROOT/bin/dmtcp_launch/lib/dmtcp/32/bin
 228           // GetProgramDir() followed the link.  So, need to remove the suffix.
 229           char *str = const_cast<char*>(installDir.c_str());
 230           str[strlen(str) - strlen("/lib/dmtcp/32")] = '\0';
 231           installDir = str;
 232         }
 233 #endif
 234 
 235         /* We need to initialize SharedData here to make sure that it is
 236          * initialized with the correct coordinator timestamp.  The coordinator
 237          * timestamp is updated only during postCkpt callback. However, the
 238          * SharedData area may be initialized earlier (for example, while
 239          * recreating threads), causing it to use *older* timestamp.
 240          */
 241         SharedData::initialize(tmpDir.c_str(),
 242                                installDir.c_str(),
 243                                &compId,
 244                                &coordInfo,
 245                                &localIPAddr);
 246 
 247         Util::prepareDlsymWrapper();
 248       }
 249 
 250       JTRACE("Creating process during restart") (upid()) (_pInfo.procname());
 251 
 252       RestoreTargetMap::iterator it;
 253       for (it = targets.begin(); it != targets.end(); it++) {
 254         RestoreTarget *t = it->second;
 255         if (_pInfo.upid() == t->_pInfo.upid()) {
 256           continue;
 257         } else if (_pInfo.isChild(t->upid()) &&
 258                    t->_pInfo.sid() != _pInfo.pid()) {
 259           t->createDependentChildProcess();
 260         }
 261       }
 262 
 263       if (createIndependentRootProcesses) {
 264         RestoreTargetMap::iterator it;
 265         for (it = independentProcessTreeRoots.begin();
 266              it != independentProcessTreeRoots.end();
 267              it++) {
 268           RestoreTarget *t = it->second;
 269           if (t != this) {
 270             t->createDependentNonChildProcess();
 271           }
 272         }
 273       }
 274 
 275       // If we were the session leader, become one now.
 276       if (_pInfo.sid() == _pInfo.pid()) {
 277         if (getsid(0) != _pInfo.pid()) {
 278           JWARNING(setsid() != -1) (getsid(0)) (JASSERT_ERRNO)
 279             .Text("Failed to restore this process as session leader.");
 280         }
 281       }
 282 
 283       // Now recreate processes with sid == _pid
 284       for (it = targets.begin(); it != targets.end(); it++) {
 285         RestoreTarget *t = it->second;
 286         if (_pInfo.upid() == t->_pInfo.upid()) {
 287           continue;
 288         } else if (t->_pInfo.sid() == _pInfo.pid()) {
 289           if (_pInfo.isChild(t->upid())) {
 290             t->createDependentChildProcess();
 291           } else if (t->isRootOfProcessTree()) {
 292             t->createDependentNonChildProcess();
 293           }
 294         }
 295       }
 296 
 297       // Now close all open fds except _fd;
 298       for (it = targets.begin(); it != targets.end(); it++) {
 299         RestoreTarget *t = it->second;
 300         if (t != this) {
 301           close(t->fd());
 302         }
 303       }
 304 
 305       string ckptDir = jalib::Filesystem::GetDeviceName(PROTECTED_CKPT_DIR_FD);
 306       if (ckptDir.length() == 0) {
 307         // Create the ckpt-dir fd so that the restarted process can know about
 308         // the abs-path of ckpt-image.
 309         string dirName = jalib::Filesystem::DirName(_path);
 310         int dirfd = open(dirName.c_str(), O_RDONLY);
 311         JASSERT(dirfd != -1) (JASSERT_ERRNO);
 312         if (dirfd != PROTECTED_CKPT_DIR_FD) {
 313           JASSERT(dup2(dirfd, PROTECTED_CKPT_DIR_FD) == PROTECTED_CKPT_DIR_FD);
 314           close(dirfd);
 315         }
 316       }
 317 
 318       if (!createIndependentRootProcesses) {
 319         // dmtcp_restart sets ENV_VAR_NAME_HOST/PORT, even if cmd line flag used
 320         const char *host = NULL;
 321         int port = UNINITIALIZED_PORT;
 322         int *port_p = &port;
 323         Util::getCoordHostAndPort(allowedModes, &host, port_p);
 324         CoordinatorAPI::instance().connectToCoordOnRestart(allowedModes,
 325                                                            _pInfo.procname(),
 326                                                            _pInfo.compGroup(),
 327                                                            _pInfo.numPeers(),
 328                                                            NULL,
 329                                                            host,
 330                                                            port,
 331                                                            NULL);
 332       }
 333 
 334       setEnvironFd();
 335       int is32bitElf = 0;
 336 
 337 #if defined(__x86_64__) || defined(__aarch64__)
 338       is32bitElf = (_pInfo.elfType() == ProcessInfo::Elf_32);
 339 #elif defined(__i386__) || defined(__arm__)
 340       is32bitElf = true;
 341 #endif
 342       runMtcpRestart(is32bitElf, _fd, &_pInfo);
 343 
 344       JASSERT ( false ).Text ( "unreachable" );
 345     }
 346 
 347   private:
 348     string _path;
 349     ProcessInfo _pInfo;
 350     int _fd;
 351 };
 352 
 353 static void runMtcpRestart(int is32bitElf, int fd, ProcessInfo *pInfo)
 354 {
 355   char fdBuf[8];
 356   char stderrFdBuf[8];
 357   sprintf(fdBuf, "%d", fd);
 358   sprintf(stderrFdBuf, "%d", PROTECTED_STDERR_FD);
 359 
 360 #ifdef HAS_PR_SET_PTRACER
 361   if (getenv("DMTCP_GDB_ATTACH_ON_RESTART")) {
 362     JNOTE("\n     *******************************************************\n"
 363           "     *** Environment variable, DMTCP_GDB_ATTACH_ON_RESTART is set\n"
 364           "     *** You can attach to the running process as follows:\n"
 365           "     ***     gdb _PROGRAM_NAME_ PID  [See below for PID.]\n"
 366           "     *** NOTE:  This mode can be a security risk.\n"
 367           "     ***        Do not set the env. variable normally.\n"
 368           "     *******************************************************")
 369          (getpid());
 370     prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); // Allow 'gdb attach'
 371   }
 372 #endif
 373 
 374   static string mtcprestart = Util::getPath ("mtcp_restart");
 375 
 376 #if defined(CONFIG_M32)
 377   if (is32bitElf) {
 378     mtcprestart = Util::getPath("mtcp_restart-32", is32bitElf);
 379   }
 380 #endif
 381 
 382   char* const newArgs[] = {
 383     (char*) mtcprestart.c_str(),
 384     const_cast<char*> ("--fd"), fdBuf,
 385     const_cast<char*> ("--stderr-fd"), stderrFdBuf,
 386     NULL
 387   };
 388 
 389   execve (newArgs[0], newArgs, environ);
 390   JASSERT (false) (newArgs[0]) (newArgs[1]) (JASSERT_ERRNO)
 391     .Text ("exec() failed");
 392 }
 393 
 394 // ************************ For reading checkpoint files *****************
 395 
 396 int readCkptHeader(const string& path, ProcessInfo *pInfo)
 397 {
 398   int fd = openCkptFileToRead(path);
 399   const size_t len = strlen(DMTCP_FILE_HEADER);
 400 
 401   jalib::JBinarySerializeReaderRaw rdr("", fd);
 402   pInfo->serialize(rdr);
 403   size_t numRead = len + rdr.bytes();
 404 
 405   // We must read in multiple of PAGE_SIZE
 406   const ssize_t pagesize = Util::pageSize();
 407   ssize_t remaining = pagesize - (numRead % pagesize);
 408   char buf[remaining];
 409   JASSERT(Util::readAll(fd, buf, remaining) == remaining);
 410   return fd;
 411 }
 412 
 413 static char first_char(const char *filename)
 414 {
 415   int fd, rc;
 416   char c;
 417 
 418   fd = open(filename, O_RDONLY);
 419   JASSERT(fd >= 0) (filename) .Text("ERROR: Cannot open filename");
 420 
 421   rc = read(fd, &c, 1);
 422   JASSERT(rc == 1) (filename) .Text("ERROR: Error reading from filename");
 423 
 424   close(fd);
 425   return c;
 426 }
 427 
 428 // Copied from mtcp/mtcp_restart.c.
 429 // Let's keep this code close to MTCP code to avoid maintenance problems.
 430 // MTCP code in:  mtcp/mtcp_restart.c:open_ckpt_to_read()
 431 // A previous version tried to replace this with popen, causing a regression:
 432 //   (no call to pclose, and possibility of using a wrong fd).
 433 // Returns fd;
 434 static int open_ckpt_to_read(const char *filename)
 435 {
 436   int fd;
 437   int fds[2];
 438   char fc;
 439   const char *decomp_path;
 440   const char **decomp_args;
 441   const char *gzip_path = "gzip";
 442   static const char * gzip_args[] = {
 443     const_cast<char*> ("gzip"),
 444     const_cast<char*> ("-d"),
 445     const_cast<char*> ("-"),
 446     NULL
 447   };
 448 #ifdef HBICT_DELTACOMP
 449   const char *hbict_path = const_cast<char*> ("hbict");
 450   static const char *hbict_args[] = {
 451     const_cast<char*> ("hbict"),
 452     const_cast<char*> ("-r"),
 453     NULL
 454   };
 455 #endif
 456   pid_t cpid;
 457 
 458   fc = first_char(filename);
 459   fd = open(filename, O_RDONLY);
 460   JASSERT(fd>=0)(filename).Text("Failed to open file.");
 461 
 462   if (fc == DMTCP_MAGIC_FIRST) { /* no compression */
 463     return fd;
 464   }
 465   else if (fc == GZIP_FIRST
 466 #ifdef HBICT_DELTACOMP
 467            || fc == HBICT_FIRST
 468 #endif
 469           ) {
 470     if (fc == GZIP_FIRST) {
 471       decomp_path = gzip_path;
 472       decomp_args = gzip_args;
 473     }
 474 #ifdef HBICT_DELTACOMP
 475     else {
 476       decomp_path = hbict_path;
 477       decomp_args = hbict_args;
 478     }
 479 #endif
 480 
 481     JASSERT(pipe(fds) != -1) (filename)
 482       .Text("Cannot create pipe to execute gunzip to decompress ckpt file!");
 483 
 484     cpid = fork();
 485 
 486     JASSERT(cpid != -1)
 487       .Text("ERROR: Cannot fork to execute gunzip to decompress ckpt file!");
 488     if (cpid > 0) { /* parent process */
 489       JTRACE("created child process to uncompress checkpoint file") (cpid);
 490       close(fd);
 491       close(fds[1]);
 492       // Wait for child process
 493       JASSERT(waitpid(cpid, NULL, 0) == cpid);
 494       return fds[0];
 495     } else { /* child process */
 496       /* Fork a grandchild process and kill the parent. This way the grandchild
 497        * process never becomes a zombie.
 498        *
 499        * Sometimes dmtcp_restart is called with multiple ckpt images. In that
 500        * situation, the dmtcp_restart process creates gzip processes and only
 501        * later forks mtcp_restart processes. The gzip processes can not be
 502        * wait()'d upon by the corresponding mtcp_restart processes because
 503        * their parent is the original dmtcp_restart process and thus they
 504        * become zombie.
 505        */
 506       cpid = fork();
 507       JASSERT(cpid != -1);
 508       if (cpid > 0) {
 509         // Use _exit() instead of exit() to avoid popping atexit() handlers
 510         // registered by the parent process.
 511         _exit(0);
 512       }
 513 
 514       // Grandchild process
 515       JTRACE ( "child process, will exec into external de-compressor");
 516       fd = dup(dup(dup(fd)));
 517       fds[1] = dup(fds[1]);
 518       close(fds[0]);
 519       JASSERT(fd != -1);
 520       JASSERT(dup2(fd, STDIN_FILENO) == STDIN_FILENO);
 521       close(fd);
 522       JASSERT(dup2(fds[1], STDOUT_FILENO) == STDOUT_FILENO);
 523       close(fds[1]);
 524       execvp(decomp_path, (char **)decomp_args);
 525       JASSERT(decomp_path!=NULL) (decomp_path)
 526         .Text("Failed to launch gzip.");
 527       /* should not get here */
 528       JASSERT(false)
 529         .Text("Decompression failed!  No restoration will be performed!");
 530     }
 531   } else { /* invalid magic number */
 532     JASSERT(false)
 533       .Text("ERROR: Invalid magic number in this checkpoint file!");
 534   }
 535   return -1;
 536 }
 537 
 538 // See comments above for open_ckpt_to_read()
 539 int openCkptFileToRead(const string& path)
 540 {
 541   char buf[1024];
 542   int fd = open_ckpt_to_read(path.c_str());
 543   // The rest of this function is for compatibility with original definition.
 544   JASSERT(fd >= 0) (path) .Text("Failed to open file.");
 545   const int len = strlen(DMTCP_FILE_HEADER);
 546   JASSERT(read(fd, buf, len) == len)(path) .Text("read() failed");
 547   if (strncmp(buf, DMTCP_FILE_HEADER, len) == 0) {
 548     JTRACE("opened checkpoint file [uncompressed]")(path);
 549   } else {
 550     close(fd);
 551     fd = open_ckpt_to_read(path.c_str()); /* Re-open from beginning */
 552     JASSERT(fd >= 0) (path) .Text("Failed to open file.");
 553   }
 554   return fd;
 555 }
 556 // ************************ End of for reading checkpoint files *************
 557 
 558 
 559 static void setEnvironFd()
 560 {
 561   char envFile[PATH_MAX];
 562   sprintf(envFile, "%s/envFile.XXXXXX", tmpDir.c_str());
 563   int fd = mkstemp(envFile);
 564   JASSERT(fd != -1) (envFile) (JASSERT_ERRNO);
 565   JASSERT(unlink(envFile) == 0) (envFile) (JASSERT_ERRNO);
 566   JASSERT(dup2(fd, PROTECTED_ENVIRON_FD) == PROTECTED_ENVIRON_FD)
 567     (JASSERT_ERRNO);
 568   JASSERT(close(fd) == 0);
 569   fd = PROTECTED_ENVIRON_FD;
 570 
 571   char **env = environ;
 572   while (*env != NULL) {
 573     Util::writeAll(fd, *env, strlen(*env) + 1); // Also write null character
 574     env++;
 575   }
 576   Util::writeAll(fd, *env, 1); // Write final null character
 577 }
 578 
 579 static void setNewCkptDir(char *path)
 580 {
 581   struct stat st;
 582   if (stat(path, &st) == -1) {
 583     JASSERT(mkdir(path, S_IRWXU) == 0 || errno == EEXIST)
 584       (JASSERT_ERRNO) (path)
 585       .Text("Error creating checkpoint directory");
 586     JASSERT(0 == access(path, X_OK|W_OK)) (path)
 587       .Text("ERROR: Missing execute- or write-access to checkpoint dir");
 588   } else {
 589     JASSERT(S_ISDIR(st.st_mode)) (path) .Text("ckptdir not a directory");
 590   }
 591 
 592   int fd = open(path, O_RDONLY);
 593   JASSERT(fd != -1) (path);
 594   JASSERT(dup2(fd, PROTECTED_CKPT_DIR_FD) == PROTECTED_CKPT_DIR_FD)
 595     (fd) (path);
 596   if (fd != PROTECTED_CKPT_DIR_FD) {
 597     close(fd);
 598   }
 599 }
 600 
 601 //shift args
 602 #define shift argc--,argv++
 603 
 604 int main(int argc, char** argv)
 605 {
 606   char *tmpdir_arg = NULL;
 607 
 608   initializeJalib();
 609 
 610   if (!getenv(ENV_VAR_QUIET)) {
 611     setenv(ENV_VAR_QUIET, "0", 0);
 612   }
 613 
 614   if (getenv(ENV_VAR_DISABLE_UID_CHECKING)) {
 615     noStrictUIDChecking = true;
 616   }
 617 
 618   if (argc == 1) {
 619     printf("%s", DMTCP_VERSION_AND_COPYRIGHT_INFO);
 620     printf("(For help: %s --help)\n\n", argv[0]);
 621     return DMTCP_FAIL_RC;
 622   }
 623 
 624   //process args
 625   shift;
 626   while (true) {
 627     string s = argc>0 ? argv[0] : "--help";
 628     if (s == "--help" && argc == 1) {
 629       printf("%s", theUsage);
 630       return DMTCP_FAIL_RC;
 631     } else if ((s == "--version") && argc == 1) {
 632       printf("%s", DMTCP_VERSION_AND_COPYRIGHT_INFO);
 633       return DMTCP_FAIL_RC;
 634     } else if (s == "-j" || s == "--join") {
 635       allowedModes = COORD_JOIN;
 636       shift;
 637     } else if (s == "--new-coordinator") {
 638       allowedModes = COORD_NEW;
 639       shift;
 640     } else if (s == "--run-as-root") {
 641       runAsRoot = true;
 642       shift;
 643     } else if (s == "--no-strict-uid-checking") {
 644       noStrictUIDChecking = true;
 645       shift;
 646     } else if (s == "-i" || s == "--interval") {
 647       setenv(ENV_VAR_CKPT_INTR, argv[1], 1);
 648       shift; shift;
 649     } else if (argv[0][0] == '-' && argv[0][1] == 'i' &&
 650                isdigit(argv[0][2])) { // else if -i5, for example
 651       setenv(ENV_VAR_CKPT_INTR, argv[0]+2, 1);
 652       shift;
 653     } else if (argc > 1 && (s == "-h" || s == "--coord-host" || s == "--host")){
 654       setenv(ENV_VAR_NAME_HOST, argv[1], 1);
 655       shift; shift;
 656     } else if (argc>1 && (s == "-p" || s == "--coord-port" || s == "--port")) {
 657       setenv(ENV_VAR_NAME_PORT, argv[1], 1);
 658       shift; shift;
 659     } else if (argv[0][0] == '-' && argv[0][1] == 'p' &&
 660                isdigit(argv[0][2])) { // else if -p0, for example
 661       setenv(ENV_VAR_NAME_PORT, argv[0]+2, 1);
 662       shift;
 663     } else if (argc>1 && s == "--port-file"){
 664       thePortFile = argv[1];
 665       shift; shift;
 666     } else if (argc > 1 && (s == "-c" || s == "--ckptdir")) {
 667       setNewCkptDir(argv[1]);
 668       shift; shift;
 669     } else if (argc > 1 && (s == "-t" || s == "--tmpdir")) {
 670       tmpdir_arg = argv[1];
 671       shift; shift;
 672     } else if (s == "-q" || s == "--quiet") {
 673       *getenv(ENV_VAR_QUIET) = *getenv(ENV_VAR_QUIET) + 1;
 674       // Just in case a non-standard version of setenv is being used:
 675       setenv(ENV_VAR_QUIET, getenv(ENV_VAR_QUIET), 1);
 676       shift;
 677     } else if ((s.length() > 2 && s.substr(0, 2) == "--") ||
 678                (s.length() > 1 && s.substr(0, 1) == "-")) {
 679       printf("Invalid Argument\n%s", theUsage);
 680       return DMTCP_FAIL_RC;
 681     } else if (argc > 1 && s == "--") {
 682       shift;
 683       break;
 684     } else {
 685       break;
 686     }
 687   }
 688 
 689   tmpDir = Util::calcTmpDir(tmpdir_arg);
 690 
 691   jassert_quiet = *getenv(ENV_VAR_QUIET) - '0';
 692 
 693   //make sure JASSERT initializes now, rather than during restart
 694   Util::initializeLogFile(tmpDir);
 695 
 696   if (!runAsRoot && (getuid() == 0 || geteuid() == 0)) {
 697     JASSERT_STDERR <<
 698       "Running dmtcp_restart as root is dangerous.  Aborting.\n"
 699       "If you still want to do this (at your own risk), then use\n" \
 700       "    dmtcp_restart --run-as-root\n";
 701     exit(0);
 702   }
 703 
 704   JTRACE("New dmtcp_restart process; _argc_ ckpt images") (argc);
 705 
 706   bool doAbort = false;
 707   for (; argc > 0; shift) {
 708     string restorename(argv[0]);
 709     struct stat buf;
 710     int rc = stat(restorename.c_str(), &buf);
 711     if (Util::strEndsWith(restorename, "_files")) {
 712       continue;
 713     } else if (!Util::strEndsWith(restorename, ".dmtcp")) {
 714       JNOTE("File doesn't have .dmtcp extension. Check Usage.")
 715         (restorename);
 716       JASSERT_STDERR << theUsage;
 717       doAbort = true;
 718     } else if (rc == -1) {
 719       char error_msg[1024];
 720       sprintf(error_msg, "\ndmtcp_restart: ckpt image %s", restorename.c_str());
 721       perror(error_msg);
 722       doAbort = true;
 723     } else if (buf.st_uid != getuid() && !noStrictUIDChecking && !runAsRoot) {
 724       /*Could also run if geteuid() matches*/
 725       printf("\nProcess uid (%d) doesn't match uid (%d) of\n" \
 726              "checkpoint image (%s).\n" \
 727              "This is dangerous.  Aborting for security reasons.\n" \
 728              "If you still want to do this (at your own risk),\n" \
 729              "  then modify dmtcp/src/%s:%d and re-compile.\n",
 730              getuid(), buf.st_uid, restorename.c_str(), __FILE__, __LINE__ - 7);
 731       doAbort = true;
 732     }
 733     if (doAbort) {
 734       exit(DMTCP_FAIL_RC);
 735     }
 736 
 737     JTRACE("Will restart ckpt image") (argv[0]);
 738     RestoreTarget *t = new RestoreTarget(argv[0]);
 739     targets[t->upid()] = t;
 740   }
 741 
 742   // Prepare list of independent process tree roots
 743   RestoreTargetMap::iterator i;
 744   for (i = targets.begin(); i != targets.end(); i++) {
 745     RestoreTarget *t1 = i->second;
 746     if (t1->isRootOfProcessTree()) {
 747       RestoreTargetMap::iterator j;
 748       for (j = targets.begin(); j != targets.end(); j++) {
 749         RestoreTarget *t2 = j->second;
 750         if (t1 == t2) continue;
 751         if (t1->sid() == t2->pid()) {
 752           break;
 753         }
 754       }
 755       if (j == targets.end()) {
 756         independentProcessTreeRoots[t1->upid()] = t1;
 757       }
 758     }
 759   }
 760   JASSERT(independentProcessTreeRoots.size() > 0)
 761     .Text("There must be at least one process tree that doesn't have\n"
 762           "  a different process as session leader.");
 763 
 764   WorkerState::setCurrentState(WorkerState::RESTARTING);
 765 
 766   RestoreTarget *t = independentProcessTreeRoots.begin()->second;
 767   JASSERT(t->pid() != 0);
 768   JASSERT(!t->noCoordinator() || allowedModes == COORD_ANY)
 769     .Text("Process had no coordinator prior to checkpoint;\n"
 770           "  but either --join or --new-coordinator was specified.");
 771   t->createProcess(true);
 772   JASSERT(false).Text("unreachable");
 773   return -1;
 774 }

/* [<][>][^][v][top][bottom][index][help] */