root/writeckpt.cpp
/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes the following definitions.
- mtcp_writememoryareas
- remap_nscd_areas
- mtcp_get_next_page_range
- mtcp_write_non_rwx_and_anonymous_pages
- writememoryarea
1 /****************************************************************************
2 * Copyright (C) 2006-2013 by Jason Ansel, Kapil Arya, and Gene Cooperman *
3 * jansel@csail.mit.edu, kapil@ccs.neu.edu, and gene@ccs.neu.edu *
4 * *
5 * This file is part of the DMTCP. *
6 * *
7 * DMTCP is free software: you can redistribute it and/or *
8 * modify it under the terms of the GNU Lesser General Public License as *
9 * published by the Free Software Foundation, either version 3 of the *
10 * License, or (at your option) any later version. *
11 * *
12 * DMTCP is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU Lesser General Public License for more details. *
16 * *
17 * You should have received a copy of the GNU Lesser General Public *
18 * License along with DMTCP:dmtcp/src. If not, see *
19 * <http://www.gnu.org/licenses/>. *
20 ****************************************************************************/
21 #include <errno.h>
22 #include <sched.h>
23 #include <signal.h>
24 #include <stdarg.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <sys/mman.h>
29 #include <sys/resource.h>
30 #include <sys/stat.h>
31 #include <sys/fcntl.h>
32 #include "dmtcp.h"
33 #include "processinfo.h"
34 #include "procmapsarea.h"
35 #include "procselfmaps.h"
36 #include "jassert.h"
37 #include "util.h"
38
/* Area names matched as prefixes in mtcp_writememoryareas(); areas whose
 * name starts with one of these are saved as private anonymous memory. */
39 #define DEV_ZERO_DELETED_STR "/dev/zero (deleted)"
40 #define DEV_NULL_DELETED_STR "/dev/null (deleted)"
41
42 /* Shared memory regions for Direct Rendering Infrastructure */
/* NOTE(review): DEV_DRI_SHMEM is not referenced in the visible part of this
 * file. */
43 #define DEV_DRI_SHMEM "/dev/dri/card"
44
/* Suffix tested against area names in mtcp_writememoryareas() to recognize
 * mappings of deleted files. */
45 #define DELETED_FILE_SUFFIX " (deleted)"
46
47
/* NEXT_FNC is defined outside this file; presumably these resolve to the
 * underlying (non-wrapped) open/close -- TODO confirm in the DMTCP headers.
 * Only _real_close is used in this file. */
48 #define _real_open NEXT_FNC(open)
49 #define _real_close NEXT_FNC(close)
50
51 using namespace dmtcp;
52
/* Declared weak; the caller in this file checks that the symbol is non-null
 * before invoking it. */
53 EXTERNC int dmtcp_infiniband_enabled(void) __attribute__((weak));
54
/* NOTE(review): END_OF_NSCD_AREAS is not referenced in the visible part of
 * this file. */
55 static const int END_OF_NSCD_AREAS = -1;
56
/* Memory-map reader used by the main checkpoint pass. Allocated in
 * mtcp_writememoryareas() and deleted there once all areas are written; a
 * non-NULL leftover is deleted at the start of the next call. */
57 ProcSelfMaps *procSelfMaps = NULL;
/* NSCD areas found during the current checkpoint. Allocated on first use,
 * cleared at the start of every mtcp_writememoryareas() call, and handed to
 * remap_nscd_areas() once the areas have been written. */
58 vector<ProcMapsArea> *nscdAreas = NULL;
59
60
61 /* Internal routines */
62 //static void sync_shared_mem(void);
63 static void writememoryarea (int fd, Area *area,
64 int stack_was_seen);
65
66 static void remap_nscd_areas(const vector<ProcMapsArea> & areas);
68 /*****************************************************************************
69 *
70 * This routine is called from time-to-time to write a new checkpoint file.
71 * It assumes all the threads are suspended.
72 *
73 * NOTE: Any memory allocated in this function should be released explicitely
74 * during the next ckpt cycle. Otherwise, on restart, we never come back to
75 * this function which can cause memory leaks.
76 *
77 *****************************************************************************/
78
79 void mtcp_writememoryareas(int fd)
80 {
81 Area area;
82 //DeviceInfo dev_info;
83 int stack_was_seen = 0;
84
85 JTRACE("Performing checkpoint.");
86
87 // Here we want to sync the shared memory pages with the backup files
88 // FIXME: Why do we need this?
89 //JTRACE("syncing shared memory with backup files");
90 //sync_shared_mem();
91
92 /**************************************************************************/
93 /* We can't do any more mallocing at this point because malloc stuff is */
94 /* outside the limits of the libmtcp.so image, so it won't get */
95 /* checkpointed, and it's possible that we would checkpoint an */
96 /* inconsistent state. See note in restoreverything routine. */
97 /**************************************************************************/
98
99 {
100 if (nscdAreas == NULL) {
101 nscdAreas = new vector<ProcMapsArea>();
102 }
103 nscdAreas->clear();
104 // This block is to ensure that the object is deleted as soon as we leave
105 // this block.
106 ProcSelfMaps procSelfMaps;
107 // Preprocess memory regions as needed.
108 while (procSelfMaps.getNextArea(&area)) {
109 if (Util::isNscdArea(area)) {
110 /* Special Case Handling: nscd is enabled*/
111 JNOTE("NSCD daemon shared memory area present.\n"
112 " MTCP will now try to remap this area in read/write mode as\n"
113 " private (zero pages), so that glibc will automatically\n"
114 " stop using NSCD or ask NSCD daemon for new shared area\n")
115 (area.name);
116
117 nscdAreas->push_back(area);
118 }
119 }
120 }
121
122 if (procSelfMaps != NULL) {
123 // We need to explicitly delete this object here because on restart, we
124 // never get back to this function and the object is never released.
125 delete procSelfMaps;
126 };
127
128 /* Finally comes the memory contents */
129 procSelfMaps = new ProcSelfMaps();
130 while (procSelfMaps->getNextArea(&area)) {
131 // TODO(kapil): Verify that we are not doing any operation that might
132 // result in a change of memory layout. For example, a call to JALLOC_NEW
133 // will invoke mmap if the JAlloc arena is full. Similarly, for STL objects
134 // such as vector and string.
135
136 if ((uint64_t)area.addr == ProcessInfo::instance().restoreBufAddr()) {
137 JASSERT(area.size == ProcessInfo::instance().restoreBufLen())
138 ((void*) area.addr) (area.size) (ProcessInfo::instance().restoreBufLen());
139 continue;
140 }
141
142 /* Original comment: Skip anything in kernel address space ---
143 * beats me what's at FFFFE000..FFFFFFFF - we can't even read it;
144 * Added: That's the vdso section for earlier Linux 2.6 kernels. For later
145 * 2.6 kernels, vdso occurs at an earlier address. If it's unreadable,
146 * then we simply won't copy it. But let's try to read all areas, anyway.
147 * **COMMENTED OUT:** if (area.addr >= HIGHEST_VA) continue;
148 */
149 /* If it's readable, but it's VDSO, it will be dangerous to restore it.
150 * In 32-bit mode later Red Hat RHEL Linux 2.6.9 releases use 0xffffe000,
151 * the last page of virtual memory. Note 0xffffe000 >= HIGHEST_VA
152 * implies we're in 32-bit mode.
153 */
154 if (area.addr >= HIGHEST_VA && area.addr == (VA)0xffffe000)
155 continue;
156 #ifdef __x86_64__
157 /* And in 64-bit mode later Red Hat RHEL Linux 2.6.9 releases
158 * use 0xffffffffff600000 for VDSO.
159 */
160 if (area.addr >= HIGHEST_VA && area.addr == (VA)0xffffffffff600000)
161 continue;
162 #endif
163
164 /* Skip anything that has no read or execute permission. This occurs
165 * on one page in a Linux 2.6.9 installation. No idea why. This code
166 * would also take care of kernel sections since we don't have read/execute
167 * permission there.
168 *
169 * EDIT: We should only skip the "---p" section for the shared libraries.
170 * Anonymous memory areas with no rwx permission should be saved regardless
171 * as the process might have removed the permissions temporarily and might
172 * want to use it later.
173 *
174 * This happens, for example, with libpthread where the pthread library
175 * tries to recycle thread stacks. When a thread exits, libpthread will
176 * remove the access permissions from the thread stack and later, when a
177 * new thread is created, it will provide the proper permission to this
178 * area and use it as the thread stack.
179 *
180 * If we do not restore this area on restart, the area might be returned by
181 * some mmap() call. Later on, when pthread wants to use this area, it will
182 * just try to use this area which now belongs to some other object. Even
183 * worse, the other object can then call munmap() on that area after
184 * libpthread started using it as thread stack causing the parts of thread
185 * stack getting munmap()'d from the memory resulting in a SIGSEGV.
186 *
187 * We suspect that libpthread is using mmap() instead of mprotect to change
188 * the permission from "---p" to "rw-p".
189 */
190
191 if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE)) &&
192 area.name[0] != '\0') {
193 continue;
194 }
195
196 if (Util::strStartsWith(area.name, DEV_ZERO_DELETED_STR) ||
197 Util::strStartsWith(area.name, DEV_NULL_DELETED_STR)) {
198 /* If the process has an area labelled as "/dev/zero (deleted)", we mark
199 * the area as Anonymous and save the contents to the ckpt image file.
200 * If this area has a MAP_SHARED attribute, it should be replaced with
201 * MAP_PRIVATE and we won't do any harm because, the /dev/zero file is
202 * an absolute source and sink. Anything written to it will be
203 * discarded and anything read from it will be all zeros.
204 * The following call to mmap will create "/dev/zero (deleted)" area
205 * mmap(addr, size, protection, MAP_SHARED | MAP_ANONYMOUS, 0, 0)
206 *
207 * The above explanation also applies to "/dev/null (deleted)"
208 */
209 JTRACE("saving area as Anonymous") (area.name);
210 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
211 area.name[0] = '\0';
212 } else if (Util::isSysVShmArea(area)) {
213 JTRACE("saving area as Anonymous") (area.name);
214 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
215 area.name[0] = '\0';
216 } else if (Util::isNscdArea(area)) {
217 /* Special Case Handling: nscd is enabled*/
218 area.prot = PROT_READ | PROT_WRITE | MTCP_PROT_ZERO_PAGE;
219 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
220 Util::writeAll(fd, &area, sizeof(area));
221 continue;
222 } else if (Util::isIBShmArea(area)) {
223 // TODO: Don't checkpoint infiniband shared area for now.
224 continue;
225 } else if (Util::strEndsWith(area.name, DELETED_FILE_SUFFIX)) {
226 /* Deleted File */
227 } else if (area.name[0] == '/' && strstr(&area.name[1], "/") != NULL) {
228 /* If an absolute pathname
229 * Posix and SysV shared memory segments can be mapped as /XYZ
230 */
231 }
232
233 /* Force the anonymous flag if it's a private writeable section, as the
234 * data has probably changed from the contents of the original images.
235 */
236
237 /* We also do this for read-only private sections as it's possible
238 * to modify a page there, too (via mprotect).
239 */
240
241 if ((area.flags & MAP_PRIVATE) /*&& (area.prot & PROT_WRITE)*/) {
242 area.flags |= MAP_ANONYMOUS;
243 }
244
245 /* Only write this image if it is not CS_RESTOREIMAGE.
246 * Skip any mapping for this image - it got saved as CS_RESTOREIMAGE
247 * at the beginning.
248 */
249
250 if (strstr (area.name, "[stack]"))
251 stack_was_seen = 1;
252 // the whole thing comes after the restore image
253 writememoryarea(fd, &area, stack_was_seen);
254 }
255
256 // Release the memory.
257 delete procSelfMaps;
258 procSelfMaps = NULL;
259
260 /* It's now safe to do this, since we're done using mtcp_readmapsline() */
261 remap_nscd_areas(*nscdAreas);
262
263 area.addr = NULL; // End of data
264 area.size = -1; // End of data
265 Util::writeAll(fd, &area, sizeof(area));
266
267 /* That's all folks */
268 JASSERT(_real_close (fd) == 0);
269 }
270
271 static void remap_nscd_areas(const vector<ProcMapsArea>& areas)
272 {
273 for (size_t i = 0; i < areas.size(); i++) {
274 JASSERT(munmap(areas[i].addr, areas[i].size) == 0) (JASSERT_ERRNO)
275 .Text("error unmapping NSCD shared area");
276 JASSERT(mmap(areas[i].addr, areas[i].size, areas[i].prot,
277 MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, 0, 0) != MAP_FAILED)
278 (JASSERT_ERRNO) .Text("error remapping NSCD shared area.");
279 }
280 }
281
282
283 /* This function returns a range of zero or non-zero pages. If the first page
284 * is non-zero, it searches for all contiguous non-zero pages and returns them.
285 * If the first page is all-zero, it searches for contiguous zero pages and
286 * returns them.
287 */
288 static void mtcp_get_next_page_range(Area *area, size_t *size, int *is_zero)
289 {
290 char *pg;
291 char *prevAddr;
292 size_t count = 0;
293 const size_t one_MB = (1024 * 1024);
294 if (area->size < one_MB) {
295 *size = area->size;
296 *is_zero = 0;
297 return;
298 }
299 *size = one_MB;
300 *is_zero = Util::areZeroPages(area->addr, one_MB / MTCP_PAGE_SIZE);
301 prevAddr = area->addr;
302 for (pg = area->addr + one_MB;
303 pg < area->addr + area->size;
304 pg += one_MB) {
305 size_t minsize = MIN(one_MB, (size_t)(area->addr + area->size - pg));
306 if (*is_zero != Util::areZeroPages(pg, minsize / MTCP_PAGE_SIZE)) {
307 break;
308 }
309 *size += minsize;
310 if (*is_zero && ++count % 10 == 0) { // madvise every 10MB
311 if (madvise(prevAddr, area->addr + *size - prevAddr,
312 MADV_DONTNEED) == -1) {
313 JNOTE("error doing madvise(..., MADV_DONTNEED)")
314 (JASSERT_ERRNO) ((void*)area->addr) ((int)*size);
315 prevAddr = pg;
316 }
317 }
318 }
319 }
320
321 static void mtcp_write_non_rwx_and_anonymous_pages(int fd, Area *orig_area)
322 {
323 Area area = *orig_area;
324 /* Now give read permission to the anonymous pages that do not have read
325 * permission. We should remove the permission as soon as we are done
326 * writing the area to the checkpoint image
327 *
328 * NOTE: Changing the permission here can results in two adjacent memory
329 * areas to become one (merged), if they have similar permissions. This can
330 * results in a modified /proc/self/maps file. We shouldn't get affected by
331 * the changes because we are going to remove the PROT_READ later in the
332 * code and that should reset the /proc/self/maps files to its original
333 * condition.
334 */
335 JASSERT(orig_area->name[0] == '\0');
336
337 if ((orig_area->prot & PROT_READ) == 0) {
338 JASSERT(mprotect(orig_area->addr, orig_area->size,
339 orig_area->prot | PROT_READ) == 0)
340 (JASSERT_ERRNO) (orig_area->size) (orig_area->addr)
341 .Text("error adding PROT_READ to mem region");
342 }
343
344 while (area.size > 0) {
345 size_t size;
346 int is_zero;
347 Area a = area;
348 if (dmtcp_infiniband_enabled && dmtcp_infiniband_enabled()) {
349 size = area.size;
350 is_zero = 0;
351 } else {
352 mtcp_get_next_page_range(&a, &size, &is_zero);
353 }
354
355 a.prot |= is_zero ? MTCP_PROT_ZERO_PAGE : 0;
356 a.size = size;
357
358 Util::writeAll(fd, &a, sizeof(a));
359 if (!is_zero) {
360 Util::writeAll(fd, a.addr, a.size);
361 } else {
362 if (madvise(a.addr, a.size, MADV_DONTNEED) == -1) {
363 JNOTE("error doing madvise(..., MADV_DONTNEED)")
364 (JASSERT_ERRNO) (a.addr) ((int)a.size);
365 }
366 }
367 area.addr += size;
368 area.size -= size;
369 }
370
371 /* Now remove the PROT_READ from the area if it didn't have it originally
372 */
373 if ((orig_area->prot & PROT_READ) == 0) {
374 JASSERT(mprotect(orig_area->addr, orig_area->size, orig_area->prot) == 0)
375 (JASSERT_ERRNO) (orig_area->addr) (orig_area->size)
376 .Text("error removing PROT_READ from mem region.");
377 }
378 }
379
380 static void writememoryarea (int fd, Area *area, int stack_was_seen)
381 {
382 void *addr = area->addr;
383
384 if (!(area -> flags & MAP_ANONYMOUS))
385 JTRACE("save region") (addr) (area->size) (area->name) (area->offset);
386 else if (area -> name[0] == '\0')
387 JTRACE("save anonymous") (addr) (area->size);
388 else
389 JTRACE("save anonymous") (addr) (area->size) (area->name) (area->offset);
390
391 if ((area -> name[0]) == '\0') {
392 char *brk = (char*)sbrk(0);
393 if (brk > area -> addr && brk <= area -> addr + area -> size)
394 strcpy(area -> name, "[heap]");
395 }
396
397 if (area->size == 0) {
398 /* Kernel won't let us munmap this. But we don't need to restore it. */
399 JTRACE("skipping over [stack] segment (not the orig stack)")
400 (addr) (area->size);
401 } else if (0 == strcmp(area -> name, "[vsyscall]") ||
402 0 == strcmp(area -> name, "[vectors]") ||
403 0 == strcmp(area -> name, "[vvar]") ||
404 0 == strcmp(area -> name, "[vdso]")) {
405 JTRACE("skipping over memory special section")
406 (area->name) (addr) (area->size);
407 } else if (area->prot == 0 ||
408 (area->name[0] == '\0' &&
409 ((area->flags & MAP_ANONYMOUS) != 0) &&
410 ((area->flags & MAP_PRIVATE) != 0))) {
411 /* Detect zero pages and do not write them to ckpt image.
412 * Currently, we detect zero pages in non-rwx mapping and anonymous
413 * mappings only
414 */
415 mtcp_write_non_rwx_and_anonymous_pages(fd, area);
416 } else {
417 /* Anonymous sections need to have their data copied to the file,
418 * as there is no file that contains their data
419 * We also save shared files to checkpoint file to handle shared memory
420 * implemented with backing files
421 */
422 JASSERT((area->flags & MAP_ANONYMOUS) || (area->flags & MAP_SHARED));
423 Util::writeAll(fd, area, sizeof(*area));
424 Util::writeAll(fd, area->addr, area->size);
425 }
426 }