root/writeckpt.cpp

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mtcp_writememoryareas
  2. remap_nscd_areas
  3. mtcp_get_next_page_range
  4. mtcp_write_non_rwx_and_anonymous_pages
  5. writememoryarea

   1 /****************************************************************************
   2  *   Copyright (C) 2006-2013 by Jason Ansel, Kapil Arya, and Gene Cooperman *
   3  *   jansel@csail.mit.edu, kapil@ccs.neu.edu, and gene@ccs.neu.edu          *
   4  *                                                                          *
   5  *   This file is part of the DMTCP.                                        *
   6  *                                                                          *
   7  *  DMTCP is free software: you can redistribute it and/or                  *
   8  *  modify it under the terms of the GNU Lesser General Public License as   *
   9  *  published by the Free Software Foundation, either version 3 of the      *
  10  *  License, or (at your option) any later version.                         *
  11  *                                                                          *
  12  *  DMTCP is distributed in the hope that it will be useful,                *
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of          *
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           *
  15  *  GNU Lesser General Public License for more details.                     *
  16  *                                                                          *
  17  *  You should have received a copy of the GNU Lesser General Public        *
  18  *  License along with DMTCP:dmtcp/src.  If not, see                        *
  19  *  <http://www.gnu.org/licenses/>.                                         *
  20  ****************************************************************************/
  21 #include <errno.h>
  22 #include <sched.h>
  23 #include <signal.h>
  24 #include <stdarg.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <sys/mman.h>
  29 #include <sys/resource.h>
  30 #include <sys/stat.h>
  31 #include <sys/fcntl.h>
  32 #include "dmtcp.h"
  33 #include "processinfo.h"
  34 #include "procmapsarea.h"
  35 #include "procselfmaps.h"
  36 #include "jassert.h"
  37 #include "util.h"
  38 
/* Mapping-name prefixes that mtcp_writememoryareas() matches with
 * Util::strStartsWith(); areas whose name starts with either string are
 * saved to the checkpoint image as private anonymous memory.
 */
#define DEV_ZERO_DELETED_STR "/dev/zero (deleted)"
#define DEV_NULL_DELETED_STR "/dev/null (deleted)"

/* Shared memory regions for Direct Rendering Infrastructure */
/* NOTE(review): not referenced by any code visible in this file. */
#define DEV_DRI_SHMEM "/dev/dri/card"

/* Name suffix tested with Util::strEndsWith() in mtcp_writememoryareas(). */
#define DELETED_FILE_SUFFIX " (deleted)"


/* Shorthands built on NEXT_FNC (defined elsewhere; presumably it resolves the
 * underlying, non-wrapped implementation -- TODO confirm).  Only _real_close
 * is used by the code visible in this file, to close the checkpoint fd.
 */
#define _real_open NEXT_FNC(open)
#define _real_close NEXT_FNC(close)

using namespace dmtcp;

/* Weak symbol: callers in this file test the function's address for non-NULL
 * before calling it, so it may be absent at link time.
 */
EXTERNC int dmtcp_infiniband_enabled(void) __attribute__((weak));

/* NOTE(review): not referenced by any code visible in this file. */
static const int END_OF_NSCD_AREAS = -1;

/* Map reader used by mtcp_writememoryareas() for its main pass.  It is
 * allocated there with new, and deleted both at the end of that function and
 * (if still non-NULL) at the start of the next call.
 */
ProcSelfMaps *procSelfMaps = NULL;
/* NSCD areas found during the pre-scan in mtcp_writememoryareas().  The vector
 * is allocated on first use, cleared on every call, and never deleted in the
 * code visible here.
 */
vector<ProcMapsArea> *nscdAreas = NULL;


/* Internal routines */
//static void sync_shared_mem(void);
/* Writes one memory area (header and, where applicable, contents) to fd. */
static void writememoryarea (int fd, Area *area,
                             int stack_was_seen);

/* Unmaps each listed area and maps private anonymous memory in its place. */
static void remap_nscd_areas(const vector<ProcMapsArea> & areas);
  67 
  68 /*****************************************************************************
  69  *
  70  *  This routine is called from time-to-time to write a new checkpoint file.
  71  *  It assumes all the threads are suspended.
  72  *
  73  *  NOTE: Any memory allocated in this function should be released explicitely
  74  *  during the next ckpt cycle. Otherwise, on restart, we never come back to
  75  *  this function which can cause memory leaks.
  76  *
  77  *****************************************************************************/
  78 
  79 void mtcp_writememoryareas(int fd)
  80 {
  81   Area area;
  82   //DeviceInfo dev_info;
  83   int stack_was_seen = 0;
  84 
  85   JTRACE("Performing checkpoint.");
  86 
  87   // Here we want to sync the shared memory pages with the backup files
  88   // FIXME: Why do we need this?
  89   //JTRACE("syncing shared memory with backup files");
  90   //sync_shared_mem();
  91 
  92   /**************************************************************************/
  93   /* We can't do any more mallocing at this point because malloc stuff is   */
  94   /* outside the limits of the libmtcp.so image, so it won't get            */
  95   /* checkpointed, and it's possible that we would checkpoint an            */
  96   /* inconsistent state.  See note in restoreverything routine.             */
  97   /**************************************************************************/
  98 
  99   {
 100     if (nscdAreas == NULL) {
 101       nscdAreas = new vector<ProcMapsArea>();
 102     }
 103     nscdAreas->clear();
 104     // This block is to ensure that the object is deleted as soon as we leave
 105     // this block.
 106     ProcSelfMaps procSelfMaps;
 107     // Preprocess memory regions as needed.
 108     while (procSelfMaps.getNextArea(&area)) {
 109       if (Util::isNscdArea(area)) {
 110         /* Special Case Handling: nscd is enabled*/
 111         JNOTE("NSCD daemon shared memory area present.\n"
 112             "  MTCP will now try to remap this area in read/write mode as\n"
 113             "  private (zero pages), so that glibc will automatically\n"
 114             "  stop using NSCD or ask NSCD daemon for new shared area\n")
 115           (area.name);
 116 
 117         nscdAreas->push_back(area);
 118       }
 119     }
 120   }
 121 
 122   if (procSelfMaps != NULL) {
 123     // We need to explicitly delete this object here because on restart, we
 124     // never get back to this function and the object is never released.
 125     delete procSelfMaps;
 126   };
 127 
 128   /* Finally comes the memory contents */
 129   procSelfMaps = new ProcSelfMaps();
 130   while (procSelfMaps->getNextArea(&area)) {
 131     // TODO(kapil): Verify that we are not doing any operation that might
 132     // result in a change of memory layout. For example, a call to JALLOC_NEW
 133     // will invoke mmap if the JAlloc arena is full. Similarly, for STL objects
 134     // such as vector and string.
 135 
 136     if ((uint64_t)area.addr == ProcessInfo::instance().restoreBufAddr()) {
 137       JASSERT(area.size == ProcessInfo::instance().restoreBufLen())
 138         ((void*) area.addr) (area.size) (ProcessInfo::instance().restoreBufLen());
 139       continue;
 140     }
 141 
 142     /* Original comment:  Skip anything in kernel address space ---
 143      *   beats me what's at FFFFE000..FFFFFFFF - we can't even read it;
 144      * Added: That's the vdso section for earlier Linux 2.6 kernels.  For later
 145      *  2.6 kernels, vdso occurs at an earlier address.  If it's unreadable,
 146      *  then we simply won't copy it.  But let's try to read all areas, anyway.
 147      * **COMMENTED OUT:** if (area.addr >= HIGHEST_VA) continue;
 148      */
 149     /* If it's readable, but it's VDSO, it will be dangerous to restore it.
 150      * In 32-bit mode later Red Hat RHEL Linux 2.6.9 releases use 0xffffe000,
 151      * the last page of virtual memory.  Note 0xffffe000 >= HIGHEST_VA
 152      * implies we're in 32-bit mode.
 153      */
 154     if (area.addr >= HIGHEST_VA && area.addr == (VA)0xffffe000)
 155       continue;
 156 #ifdef __x86_64__
 157     /* And in 64-bit mode later Red Hat RHEL Linux 2.6.9 releases
 158      * use 0xffffffffff600000 for VDSO.
 159      */
 160     if (area.addr >= HIGHEST_VA && area.addr == (VA)0xffffffffff600000)
 161       continue;
 162 #endif
 163 
 164     /* Skip anything that has no read or execute permission.  This occurs
 165      * on one page in a Linux 2.6.9 installation.  No idea why.  This code
 166      * would also take care of kernel sections since we don't have read/execute
 167      * permission there.
 168      *
 169      * EDIT: We should only skip the "---p" section for the shared libraries.
 170      * Anonymous memory areas with no rwx permission should be saved regardless
 171      * as the process might have removed the permissions temporarily and might
 172      * want to use it later.
 173      *
 174      * This happens, for example, with libpthread where the pthread library
 175      * tries to recycle thread stacks. When a thread exits, libpthread will
 176      * remove the access permissions from the thread stack and later, when a
 177      * new thread is created, it will provide the proper permission to this
 178      * area and use it as the thread stack.
 179      *
 180      * If we do not restore this area on restart, the area might be returned by
 181      * some mmap() call. Later on, when pthread wants to use this area, it will
 182      * just try to use this area which now belongs to some other object. Even
 183      * worse, the other object can then call munmap() on that area after
 184      * libpthread started using it as thread stack causing the parts of thread
 185      * stack getting munmap()'d from the memory resulting in a SIGSEGV.
 186      *
 187      * We suspect that libpthread is using mmap() instead of mprotect to change
 188      * the permission from "---p" to "rw-p".
 189      */
 190 
 191     if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE)) &&
 192         area.name[0] != '\0') {
 193       continue;
 194     }
 195 
 196     if (Util::strStartsWith(area.name, DEV_ZERO_DELETED_STR) ||
 197         Util::strStartsWith(area.name, DEV_NULL_DELETED_STR)) {
 198       /* If the process has an area labelled as "/dev/zero (deleted)", we mark
 199        *   the area as Anonymous and save the contents to the ckpt image file.
 200        * If this area has a MAP_SHARED attribute, it should be replaced with
 201        *   MAP_PRIVATE and we won't do any harm because, the /dev/zero file is
 202        *   an absolute source and sink. Anything written to it will be
 203        *   discarded and anything read from it will be all zeros.
 204        * The following call to mmap will create "/dev/zero (deleted)" area
 205        *         mmap(addr, size, protection, MAP_SHARED | MAP_ANONYMOUS, 0, 0)
 206        *
 207        * The above explanation also applies to "/dev/null (deleted)"
 208        */
 209       JTRACE("saving area as Anonymous") (area.name);
 210       area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
 211       area.name[0] = '\0';
 212     } else if (Util::isSysVShmArea(area)) {
 213       JTRACE("saving area as Anonymous") (area.name);
 214       area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
 215       area.name[0] = '\0';
 216     } else if (Util::isNscdArea(area)) {
 217       /* Special Case Handling: nscd is enabled*/
 218       area.prot = PROT_READ | PROT_WRITE | MTCP_PROT_ZERO_PAGE;
 219       area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
 220       Util::writeAll(fd, &area, sizeof(area));
 221       continue;
 222     } else if (Util::isIBShmArea(area)) {
 223       // TODO: Don't checkpoint infiniband shared area for now.
 224       continue;
 225     } else if (Util::strEndsWith(area.name, DELETED_FILE_SUFFIX)) {
 226       /* Deleted File */
 227     } else if (area.name[0] == '/' && strstr(&area.name[1], "/") != NULL) {
 228       /* If an absolute pathname
 229        * Posix and SysV shared memory segments can be mapped as /XYZ
 230        */
 231     }
 232 
 233     /* Force the anonymous flag if it's a private writeable section, as the
 234      * data has probably changed from the contents of the original images.
 235      */
 236 
 237     /* We also do this for read-only private sections as it's possible
 238      * to modify a page there, too (via mprotect).
 239      */
 240 
 241     if ((area.flags & MAP_PRIVATE) /*&& (area.prot & PROT_WRITE)*/) {
 242       area.flags |= MAP_ANONYMOUS;
 243     }
 244 
 245     /* Only write this image if it is not CS_RESTOREIMAGE.
 246      * Skip any mapping for this image - it got saved as CS_RESTOREIMAGE
 247      * at the beginning.
 248      */
 249 
 250     if (strstr (area.name, "[stack]"))
 251       stack_was_seen = 1;
 252     // the whole thing comes after the restore image
 253     writememoryarea(fd, &area, stack_was_seen);
 254   }
 255 
 256   // Release the memory.
 257   delete procSelfMaps;
 258   procSelfMaps = NULL;
 259 
 260   /* It's now safe to do this, since we're done using mtcp_readmapsline() */
 261   remap_nscd_areas(*nscdAreas);
 262 
 263   area.addr = NULL; // End of data
 264   area.size = -1; // End of data
 265   Util::writeAll(fd, &area, sizeof(area));
 266 
 267   /* That's all folks */
 268   JASSERT(_real_close (fd) == 0);
 269 }
 270 
 271 static void remap_nscd_areas(const vector<ProcMapsArea>& areas)
 272 {
 273   for (size_t i = 0; i < areas.size(); i++) {
 274     JASSERT(munmap(areas[i].addr, areas[i].size) == 0) (JASSERT_ERRNO)
 275       .Text("error unmapping NSCD shared area");
 276     JASSERT(mmap(areas[i].addr, areas[i].size, areas[i].prot,
 277             MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, 0, 0) != MAP_FAILED)
 278       (JASSERT_ERRNO) .Text("error remapping NSCD shared area.");
 279   }
 280 }
 281 
 282 
 283 /* This function returns a range of zero or non-zero pages. If the first page
 284  * is non-zero, it searches for all contiguous non-zero pages and returns them.
 285  * If the first page is all-zero, it searches for contiguous zero pages and
 286  * returns them.
 287  */
 288 static void mtcp_get_next_page_range(Area *area, size_t *size, int *is_zero)
 289 {
 290   char *pg;
 291   char *prevAddr;
 292   size_t count = 0;
 293   const size_t one_MB = (1024 * 1024);
 294   if (area->size < one_MB) {
 295     *size = area->size;
 296     *is_zero = 0;
 297     return;
 298   }
 299   *size = one_MB;
 300   *is_zero = Util::areZeroPages(area->addr, one_MB / MTCP_PAGE_SIZE);
 301   prevAddr = area->addr;
 302   for (pg = area->addr + one_MB;
 303        pg < area->addr + area->size;
 304        pg += one_MB) {
 305     size_t minsize = MIN(one_MB, (size_t)(area->addr + area->size - pg));
 306     if (*is_zero != Util::areZeroPages(pg, minsize / MTCP_PAGE_SIZE)) {
 307       break;
 308     }
 309     *size += minsize;
 310     if (*is_zero && ++count % 10 == 0) { // madvise every 10MB
 311       if (madvise(prevAddr, area->addr + *size - prevAddr,
 312                   MADV_DONTNEED) == -1) {
 313         JNOTE("error doing madvise(..., MADV_DONTNEED)")
 314           (JASSERT_ERRNO) ((void*)area->addr) ((int)*size);
 315         prevAddr = pg;
 316       }
 317     }
 318   }
 319 }
 320 
 321 static void mtcp_write_non_rwx_and_anonymous_pages(int fd, Area *orig_area)
 322 {
 323   Area area = *orig_area;
 324   /* Now give read permission to the anonymous pages that do not have read
 325    * permission. We should remove the permission as soon as we are done
 326    * writing the area to the checkpoint image
 327    *
 328    * NOTE: Changing the permission here can results in two adjacent memory
 329    * areas to become one (merged), if they have similar permissions. This can
 330    * results in a modified /proc/self/maps file. We shouldn't get affected by
 331    * the changes because we are going to remove the PROT_READ later in the
 332    * code and that should reset the /proc/self/maps files to its original
 333    * condition.
 334    */
 335   JASSERT(orig_area->name[0] == '\0');
 336 
 337   if ((orig_area->prot & PROT_READ) == 0) {
 338     JASSERT(mprotect(orig_area->addr, orig_area->size,
 339                      orig_area->prot | PROT_READ) == 0)
 340       (JASSERT_ERRNO) (orig_area->size) (orig_area->addr)
 341       .Text("error adding PROT_READ to mem region");
 342   }
 343 
 344   while (area.size > 0) {
 345     size_t size;
 346     int is_zero;
 347     Area a = area;
 348     if (dmtcp_infiniband_enabled && dmtcp_infiniband_enabled()) {
 349       size = area.size;
 350       is_zero = 0;
 351     } else {
 352       mtcp_get_next_page_range(&a, &size, &is_zero);
 353     }
 354 
 355     a.prot |= is_zero ? MTCP_PROT_ZERO_PAGE : 0;
 356     a.size = size;
 357 
 358     Util::writeAll(fd, &a, sizeof(a));
 359     if (!is_zero) {
 360       Util::writeAll(fd, a.addr, a.size);
 361     } else {
 362       if (madvise(a.addr, a.size, MADV_DONTNEED) == -1) {
 363         JNOTE("error doing madvise(..., MADV_DONTNEED)")
 364           (JASSERT_ERRNO) (a.addr) ((int)a.size);
 365       }
 366     }
 367     area.addr += size;
 368     area.size -= size;
 369   }
 370 
 371   /* Now remove the PROT_READ from the area if it didn't have it originally
 372   */
 373   if ((orig_area->prot & PROT_READ) == 0) {
 374     JASSERT(mprotect(orig_area->addr, orig_area->size, orig_area->prot) == 0)
 375       (JASSERT_ERRNO) (orig_area->addr) (orig_area->size)
 376       .Text("error removing PROT_READ from mem region.");
 377   }
 378 }
 379 
 380 static void writememoryarea (int fd, Area *area, int stack_was_seen)
 381 {
 382   void *addr = area->addr;
 383 
 384   if (!(area -> flags & MAP_ANONYMOUS))
 385     JTRACE("save region") (addr) (area->size) (area->name) (area->offset);
 386   else if (area -> name[0] == '\0')
 387     JTRACE("save anonymous") (addr) (area->size);
 388   else
 389     JTRACE("save anonymous") (addr) (area->size) (area->name) (area->offset);
 390 
 391   if ((area -> name[0]) == '\0') {
 392     char *brk = (char*)sbrk(0);
 393     if (brk > area -> addr && brk <= area -> addr + area -> size)
 394       strcpy(area -> name, "[heap]");
 395   }
 396 
 397   if (area->size == 0) {
 398     /* Kernel won't let us munmap this.  But we don't need to restore it. */
 399     JTRACE("skipping over [stack] segment (not the orig stack)")
 400       (addr) (area->size);
 401   } else if (0 == strcmp(area -> name, "[vsyscall]") ||
 402              0 == strcmp(area -> name, "[vectors]") ||
 403              0 == strcmp(area -> name, "[vvar]") ||
 404              0 == strcmp(area -> name, "[vdso]")) {
 405     JTRACE("skipping over memory special section")
 406       (area->name) (addr) (area->size);
 407   } else if (area->prot == 0 ||
 408       (area->name[0] == '\0' &&
 409        ((area->flags & MAP_ANONYMOUS) != 0) &&
 410        ((area->flags & MAP_PRIVATE) != 0))) {
 411     /* Detect zero pages and do not write them to ckpt image.
 412      * Currently, we detect zero pages in non-rwx mapping and anonymous
 413      * mappings only
 414      */
 415     mtcp_write_non_rwx_and_anonymous_pages(fd, area);
 416   } else {
 417     /* Anonymous sections need to have their data copied to the file,
 418      *   as there is no file that contains their data
 419      * We also save shared files to checkpoint file to handle shared memory
 420      *   implemented with backing files
 421      */
 422     JASSERT((area->flags & MAP_ANONYMOUS) || (area->flags & MAP_SHARED));
 423     Util::writeAll(fd, area, sizeof(*area));
 424     Util::writeAll(fd, area->addr, area->size);
 425   }
 426 }

/* [<][>][^][v][top][bottom][index][help] */