From 6f8d5435bbb54bf44f73802ada0b351f2f473156 Mon Sep 17 00:00:00 2001 From: prakritigoyal19 Date: Sun, 7 Jun 2020 09:22:53 +0530 Subject: [PATCH 001/122] Add flog to CRIU Change made through this commit: - Include copy of flog as a seperate tree. - Modify the makefile to add and compile flog code. Signed-off-by: prakritigoyal19 --- Makefile | 12 ++- flog/Makefile | 29 ++++++ flog/built-in.S | 4 + flog/include/compiler.h | 71 +++++++++++++ flog/include/flog.h | 9 ++ flog/include/log.h | 17 ++++ flog/include/types.h | 16 +++ flog/include/uapi/flog.h | 149 +++++++++++++++++++++++++++ flog/include/util.h | 37 +++++++ flog/src/Makefile | 5 + flog/src/flog.c | 215 +++++++++++++++++++++++++++++++++++++++ flog/src/main.c | 170 +++++++++++++++++++++++++++++++ flog/tests/test00 | 22 ++++ 13 files changed, 755 insertions(+), 1 deletion(-) create mode 100644 flog/Makefile create mode 100644 flog/built-in.S create mode 100644 flog/include/compiler.h create mode 100644 flog/include/flog.h create mode 100644 flog/include/log.h create mode 100644 flog/include/types.h create mode 100644 flog/include/uapi/flog.h create mode 100644 flog/include/util.h create mode 100644 flog/src/Makefile create mode 100644 flog/src/flog.c create mode 100644 flog/src/main.c create mode 100755 flog/tests/test00 diff --git a/Makefile b/Makefile index ad70800eb5..e67bcc5614 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: flog criu lib crit .PHONY: all # @@ -233,6 +233,15 @@ soccr/built-in.o: $(CONFIG_HEADER) .FORCE $(SOCCR_A): |soccr/built-in.o criu-deps += $(SOCCR_A) +#flog gets used by criu, build it earlier + +flogMakefile: ; +flog%: + $(Q) $(MAKE) $(build)=flog $@ +flog: + $(Q) $(MAKE) $(build)=flog all +.PHONY: flog + # # CRIU building done in own directory # with slightly different rules so we @@ -275,6 +284,7 @@ lib: crit clean mrproper: $(Q) $(MAKE) 
$(build)=images $@ + $(Q) $(MAKE) $(build)=flog $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ diff --git a/flog/Makefile b/flog/Makefile new file mode 100644 index 0000000000..12255af719 --- /dev/null +++ b/flog/Makefile @@ -0,0 +1,29 @@ +OPTS=-ggdb3 -Wall -Werror +export OPTS + +CFLAGS += -iquote include +CFLAGS += -iquote flog/include +CFLAGS += -iquote flog/include/uapi + +include $(__nmk_dir)msg.mk + +$(eval $(call gen-built-in,src)) + +flog: + $(Q) $(MAKE) $(build)=$(obj)/src all +.PHONY: flog + +clean-flog: + $(call msg-gen, $@) + $(Q) $(MAKE) $(build)=$(obj)/src clean + $(Q) $(RM) built-in.o +.PHONY: clean-flog + +clean: clean-flog +mrproper: clean + +test: + ./tests/test00 + +all-y += flog + diff --git a/flog/built-in.S b/flog/built-in.S new file mode 100644 index 0000000000..26627d0544 --- /dev/null +++ b/flog/built-in.S @@ -0,0 +1,4 @@ +SECTIONS +{ + .rodata : { _rodata_start = . ; *(.rodata*) ; _rodata_end = . ;} +} diff --git a/flog/include/compiler.h b/flog/include/compiler.h new file mode 100644 index 0000000000..3e56eb0e64 --- /dev/null +++ b/flog/include/compiler.h @@ -0,0 +1,71 @@ +#ifndef __COMPILER_H__ +#define __COMPILER_H__ + +/* + * Various definitions for success build, + * picked from various places, mostly from + * the linux kernel. + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +#define __section(S) __attribute__ ((__section__(#S))) + +#ifndef __always_inline +# define __always_inline inline __attribute__((always_inline)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef always_inline +# define always_inline __always_inline +#endif + +#ifndef noinline +# define noinline __attribute__((noinline)) +#endif + +#define __aligned(x) __attribute__((aligned(x))) + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#define __round_mask(x, y) ((__typeof__(x))((y) - 1)) +#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; }) + +#define is_log2(v) (((v) & ((v) - 1)) == 0) + +#endif /* __COMPILER_H__ */ diff --git a/flog/include/flog.h b/flog/include/flog.h new file mode 100644 index 0000000000..f00c20541f --- /dev/null +++ b/flog/include/flog.h @@ -0,0 +1,9 @@ +#ifndef __FLOG_H__ +#define __FLOG_H__ + +#include +#include + +#include "uapi/flog.h" + +#endif /* __FLOG_H__ */ diff --git a/flog/include/log.h b/flog/include/log.h new file mode 100644 index 0000000000..1a165ea9fb --- /dev/null +++ b/flog/include/log.h @@ -0,0 +1,17 @@ +#ifndef __LOG_H__ +#define __LOG_H__ + +#include + +#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) + +#if 1 +# define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#else +# define pr_debug(fmt, ...) +#endif + +#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): "fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_perror(fmt, ...) fprintf(stderr, "Error (%s:%d): "fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) + +#endif /* __LOG_H__ */ diff --git a/flog/include/types.h b/flog/include/types.h new file mode 100644 index 0000000000..0e15bfbff5 --- /dev/null +++ b/flog/include/types.h @@ -0,0 +1,16 @@ +#ifndef __FLOG_TYPES_H__ +#define __FLOG_TYPES_H__ + +#include +#include + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +#endif /* __FLOG_TYPES_H__ */ diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h new file mode 100644 index 0000000000..2d879110fc --- /dev/null +++ b/flog/include/uapi/flog.h @@ -0,0 +1,149 @@ +#ifndef __UAPI_FLOG_H__ +#define __UAPI_FLOG_H__ + +#include +#include +#include + +/* + * We work with up to 32 arguments in macros here. + * If more provided -- behaviour is undefined. + */ + +/* + * By Laurent Deniau at https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s + */ +#define FLOG_PP_NARG_(...) 
FLOG_PP_ARG_N(__VA_ARGS__) +#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) + +#define FLOG_PP_ARG_N( _0, _1, _2, _3, _4, \ + _5, _6, _7, _8, _9, \ + _10,_11,_12,_13,_14, \ + _15,_16,_17,_18,_19, \ + _20,_21,_22,_23,_24, \ + _25,_26,_27,_28,_29, \ + _30,_31, N, ...) N + +#define FLOG_PP_RSEQ_N() \ + 31, 30, 29, 28, 27, \ + 26, 25, 24, 23, 22, \ + 21, 20, 19, 18, 17, \ + 16, 15, 14, 13, 12, \ + 11, 10, 9, 8, 7, \ + 6, 5, 4, 3, 2, \ + 1, 0 + +#define FLOG_GENMASK_0(N, x) 0 +#define FLOG_GENMASK_1(N, op, x, ...) (op(N, 0, x)) +#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_11(N, op, x, ...) ((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_17(N, op, x, ...) 
((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_22(N, op, x, ...) ((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_32(N, op, x, ...) ((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) + +#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) +#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) +#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 + +#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) +#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) + +#define flog_genbit(ord, n, v, ...) 
\ + _Generic((v), \ + \ + /* Basic types */ \ + char: 0, \ + signed char: 0, \ + unsigned char: 0, \ + signed short int: 0, \ + unsigned short int: 0, \ + signed int: 0, \ + unsigned int: 0, \ + signed long: 0, \ + unsigned long: 0, \ + signed long long: 0, \ + unsigned long long: 0, \ + \ + /* Not used for a while */ \ + /* float: 12, */ \ + /* double: 13, */ \ + /* long double: 14, */ \ + \ + /* Basic poniters */ \ + char *: (1u << (ord - n - 1)), \ + signed char *: (1u << (ord - n - 1)), \ + unsigned char *: (1u << (ord - n - 1)), \ + signed short int *: 0, \ + unsigned short int *: 0, \ + signed int *: 0, \ + unsigned int *: 0, \ + signed long *: 0, \ + unsigned long *: 0, \ + signed long long *: 0, \ + unsigned long long *: 0, \ + void *: 0, \ + \ + /* Const basic pointers */ \ + const char *: (1u << (ord - n - 1)), \ + const signed char *: (1u << (ord - n - 1)), \ + const unsigned char *: (1u << (ord - n - 1)), \ + const signed short int *: 0, \ + const unsigned short int *: 0, \ + const signed int *: 0, \ + const unsigned int *: 0, \ + const signed long *: 0, \ + const unsigned long *: 0, \ + const signed long long *: 0, \ + const unsigned long long *: 0, \ + const void *: 0, \ + \ + /* Systypes and pointers */ \ + default: -1) + +typedef struct { + unsigned int magic; + unsigned int size; + unsigned int nargs; + unsigned int mask; + long fmt; + long args[0]; +} flog_msg_t; + +extern int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...); +void flog_decode_msg(int fdout, const char *format, ...); +extern int flog_decode_all(int fdin, int fdout); + +#define flog_encode(fdout, fmt, ...) 
\ + flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), \ + FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) + +int flog_map_buf(int fdout); +int flog_close(int fdout); + +#endif /* __UAPI_FLOG_H__ */ diff --git a/flog/include/util.h b/flog/include/util.h new file mode 100644 index 0000000000..17a4d77997 --- /dev/null +++ b/flog/include/util.h @@ -0,0 +1,37 @@ +#ifndef __UTIL_H__ +#define __UTIL_H__ + +#include +#include + +#include "log.h" +#include "types.h" + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) do { if (p) free(p); } while (0) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -ENOMEM; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +#endif /* __UTIL_H__ */ diff --git a/flog/src/Makefile b/flog/src/Makefile new file mode 100644 index 0000000000..ee73ea7252 --- /dev/null +++ b/flog/src/Makefile @@ -0,0 +1,5 @@ +ccflags-y += -DCONFIG_X86_64 -iquote ./include $(OPTS) +ldflags-y += -r + +#obj-y += main.o +obj-y += flog.o diff --git a/flog/src/flog.c b/flog/src/flog.c new file mode 100644 index 0000000000..533625de61 --- /dev/null +++ b/flog/src/flog.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include + +//#include + +#include "uapi/flog.h" +#include "util.h" + +#define MAGIC 0xABCDABCD + +#define BUF_SIZE (1<<20) +static char _mbuf[BUF_SIZE]; +static char *mbuf = _mbuf; +static char *fbuf; +static uint64_t fsize; +static uint64_t mbuf_size = sizeof(_mbuf); + +/*int flog_decode_all(int fdin, int fdout) +{ + flog_msg_t *m = (void *)mbuf; + ffi_type 
*args[34] = { + [0] = &ffi_type_sint, + [1] = &ffi_type_pointer, + [2 ... 33] = &ffi_type_slong + }; + void *values[34]; + ffi_cif cif; + ffi_arg rc; + size_t i, ret; + char *fmt; + + values[0] = (void *)&fdout; + + while (1) { + ret = read(fdin, mbuf, sizeof(m)); + if (ret == 0) + break; + if (ret < 0) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + if (m->magic != MAGIC) { + fprintf(stderr, "The log file was not properly closed\n"); + break; + } + ret = m->size - sizeof(m); + if (m->size > mbuf_size) { + fprintf(stderr, "The buffer is too small"); + return -1; + } + if (read(fdin, mbuf + sizeof(m), ret) != ret) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + + fmt = mbuf + m->fmt; + values[1] = &fmt; + + for (i = 0; i < m->nargs; i++) { + values[i + 2] = (void *)&m->args[i]; + if (m->mask & (1u << i)) { + m->args[i] = (long)(mbuf + m->args[i]); + } + } + + if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, m->nargs + 2, + &ffi_type_sint, args) == FFI_OK) + ffi_call(&cif, FFI_FN(dprintf), &rc, values); + } + return 0; +}*/ + +static int flog_enqueue(flog_msg_t *m) +{ + if (write(1, m, m->size) != m->size) { + fprintf(stderr, "Unable to write a message\n"); + return -1; + } + return 0; +} + +/*extern char *rodata_start; +extern char *rodata_end; +*/ +/* Pre-allocate a buffer in a file and map it into memory. */ +int flog_map_buf(int fdout) +{ + uint64_t off = 0; + void *addr; + + /* + * Two buffers are mmaped into memory. A new one is mapped when a first + * one is completly filled. 
+ */ + if (fbuf && (mbuf - fbuf < BUF_SIZE)) + return 0; + + if (fbuf) { + if (munmap(fbuf, BUF_SIZE * 2)) { + fprintf(stderr, "Unable to unmap a buffer: %m"); + return -1; + } + off = mbuf - fbuf - BUF_SIZE; + fbuf = NULL; + } + + if (fsize == 0) + fsize += BUF_SIZE; + fsize += BUF_SIZE; + + if (ftruncate(fdout, fsize)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + + if (!fbuf) + addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, + MAP_FILE | MAP_SHARED, fdout, fsize - 2 * BUF_SIZE); + else + addr = mremap(fbuf + BUF_SIZE, BUF_SIZE, + BUF_SIZE * 2, MREMAP_FIXED, fbuf); + if (addr == MAP_FAILED) { + fprintf(stderr, "Unable to map a buffer: %m"); + return -1; + } + + fbuf = addr; + mbuf = fbuf + off; + mbuf_size = 2 * BUF_SIZE; + + return 0; +} + +int flog_close(int fdout) +{ + if (mbuf == _mbuf) + return 0; + + munmap(fbuf, BUF_SIZE * 2); + + if (ftruncate(fdout, fsize - 2 * BUF_SIZE + mbuf - fbuf)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + return 0; +} + +int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...) +{ + flog_msg_t *m; + va_list argptr; + char *str_start, *p; + size_t i; + + if (mbuf != _mbuf && flog_map_buf(fdout)) + return -1; + + m = (void *) mbuf; + + m->nargs = nargs; + m->mask = mask; + + str_start = (void *)m->args + sizeof(m->args[0]) * nargs; + p = memccpy(str_start, format, 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + return -1; + } + m->fmt = str_start - mbuf; + str_start = p; + + va_start(argptr, format); + for (i = 0; i < nargs; i++) { + m->args[i] = (long)va_arg(argptr, long); + /* + * If we got a string, we should either + * reference it when in rodata, or make + * a copy (FIXME implement rodata refs). 
+ */ + if (mask & (1u << i)) { + p = memccpy(str_start, (void *)m->args[i], 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + return -1; + } + m->args[i] = str_start - mbuf; + str_start = p; + } + } + va_end(argptr); + m->size = str_start - mbuf; + + /* + * A magic is required to know where we stop writing into a log file, + * if it was not properly closed. The file is mapped into memory, so a + * space in the file is allocated in advance and at the end it can have + * some unused tail. + */ + m->magic = MAGIC; + + m->size = roundup(m->size, 8); + if (mbuf == _mbuf) { + if (flog_enqueue(m)) + return -1; + } else { + mbuf += m->size; + mbuf_size -= m->size; + } + return 0; +} diff --git a/flog/src/main.c b/flog/src/main.c new file mode 100644 index 0000000000..c84e774781 --- /dev/null +++ b/flog/src/main.c @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "flog.h" + +extern char _rodata_start, _rodata_end; +char *rodata_start = &_rodata_start; +char *rodata_end = &_rodata_end; + +enum { + MODE_BINARY, + MODE_FPRINTF, + MODE_SPRINTF, + MODE_DPRINTF, +}; + +int main(int argc, char *argv[]) +{ + static const char str1[] = "String1 String1"; + static const char str2[] = "string2 string2 string2"; + int fdout = STDOUT_FILENO; + bool use_decoder = false; + int mode = MODE_BINARY; + size_t niter = 100; + int opt, idx; + size_t i; + + static const char short_opts[] = "m:o:di:h"; + static struct option long_opts[] = { + { "mode", required_argument, 0, 'm' }, + { "output", required_argument, 0, 'o' }, + { "decode", no_argument, 0, 'd' }, + { "iter", required_argument, 0, 'i' }, + { "help", no_argument, 0, 'h' }, + { }, + }; + + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + + switch (opt) { + case 'm': + if (strcmp(optarg, "binary") == 0) { + mode = MODE_BINARY; + } else if (strcmp(optarg, 
"fprintf") == 0) { + mode = MODE_FPRINTF; + } else if (strcmp(optarg, "sprintf") == 0) { + mode = MODE_SPRINTF; + } else if (strcmp(optarg, "dprintf") == 0) { + mode = MODE_DPRINTF; + } else + goto usage; + break; + case 'o': + if (strcmp(optarg, "stdout") == 0) { + fdout = fileno(stdout); + } else if (strcmp(optarg, "stderr") == 0) { + fdout = fileno(stderr); + } else { + fdout = open(optarg, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fdout < 0) { + fprintf(stderr, "Can't open %s: %s\n", + optarg, strerror(errno)); + exit(1); + } + } + break; + case 'i': + niter = atoi(optarg); + break; + case 'd': + use_decoder = true; + break; + case 'h': + default: + goto usage; + } + } + + switch (mode) { + case MODE_BINARY: + if (use_decoder) + return flog_decode_all(STDIN_FILENO, fdout); + + if (fdout != STDOUT_FILENO && flog_map_buf(fdout)) + return 1; + for (i = 0; i < niter; i++) + if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", + str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2)) + return 1; + if (flog_close(fdout)) + return 1; + break; + case MODE_DPRINTF: + { + for (i = 0; i < niter; i++) { + dprintf(fdout, "Some message %s %s %c %li %d %lu\n", + str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + case MODE_FPRINTF: + { + FILE *f = fdopen(fdout, "w"); + + for (i = 0; i < niter; i++) { + fprintf(f, "Some message %s %s %c %li %d %lu\n", + str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + fflush(f); + } + fclose(f); + break; + } + case MODE_SPRINTF: + { + static char buf[4096]; + + for (i = 0; i < niter; i++) { + sprintf(buf, "Some message %s %s %c %li %d %lu\n", + str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + default: + return 1; + } + + return 0; +usage: + fprintf(stderr, + "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" + "\n" + + "Examples:\n" + "\n" + + " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" + 
"\n" + " flog -m dprintf -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode without processing (queue messages only)\n" + "\n" + " flog -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after\n" + "\n" + " flog -i 100000 -d\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after, writting results into 'out' file\n" + "\n" + " flog -i 100000 -d -o out\n" + "\n"); + return 1; +} diff --git a/flog/tests/test00 b/flog/tests/test00 new file mode 100755 index 0000000000..a7937e4a18 --- /dev/null +++ b/flog/tests/test00 @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e -x + +echo Map a log file into memory +time ./flog run -i 1000000 -o /tmp/flog.raw.map +echo Write into a log file +time ./flog run -i 1000000 > /tmp/flog.raw +echo Use fprintf +time ./flog run -m fprintf -i 1000000 -o /tmp/flog.fprintf.txt +echo Use dprintf +time ./flog run -m dprintf -i 1000000 -o /tmp/flog.dprintf.txt +echo Use sprintf +time ./flog run -m sprintf -i 1000000 + +time ./flog run -d < /tmp/flog.raw > /tmp/flog.raw.txt +cmp /tmp/flog.raw.txt /tmp/flog.fprintf.txt + +time ./flog run -d < /tmp/flog.raw.map > /tmp/flog.raw.map.txt +cmp /tmp/flog.raw.map.txt /tmp/flog.fprintf.txt + +cmp /tmp/flog.dprintf.txt /tmp/flog.fprintf.txt From bf7b517fdabf05770e615cb2b2e6ccb89be302c7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 28 Sep 2020 07:04:00 +0000 Subject: [PATCH 002/122] flog: Missing varargs init or cleanup (VARARGS) CID 302713 (#1 of 1): Missing varargs init or cleanup (VARARGS) va_end was not called for argptr. 
Signed-off-by: Adrian Reber --- flog/src/flog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/flog/src/flog.c b/flog/src/flog.c index 533625de61..40cce3fedc 100644 --- a/flog/src/flog.c +++ b/flog/src/flog.c @@ -186,6 +186,7 @@ int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char p = memccpy(str_start, (void *)m->args[i], 0, mbuf_size - (str_start - mbuf)); if (p == NULL) { fprintf(stderr, "No memory for string argument\n"); + va_end(argptr); return -1; } m->args[i] = str_start - mbuf; From 09348a277f42f0f516d8b4c66e621e2a7aae3a0e Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 4 Aug 2021 07:27:07 +0000 Subject: [PATCH 003/122] Run 'make indent' on 'flog/' Separate commit for easier criu-dev <-> master transfer. Acked-by: Mike Rapoport Signed-off-by: Adrian Reber --- flog/include/compiler.h | 88 +++++++++++++++------------- flog/include/log.h | 10 ++-- flog/include/types.h | 16 +++--- flog/include/uapi/flog.h | 120 ++++++++++++++++++--------------------- flog/include/util.h | 52 +++++++++-------- flog/src/flog.c | 11 ++-- flog/src/main.c | 37 +++++------- 7 files changed, 161 insertions(+), 173 deletions(-) diff --git a/flog/include/compiler.h b/flog/include/compiler.h index 3e56eb0e64..80264ec631 100644 --- a/flog/include/compiler.h +++ b/flog/include/compiler.h @@ -8,64 +8,70 @@ */ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) -#define NORETURN __attribute__((__noreturn__)) -#define __packed __attribute__((__packed__)) -#define __used __attribute__((__used__)) -#define __maybe_unused __attribute__((unused)) -#define __always_unused __attribute__((unused)) +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) -#define __section(S) __attribute__ ((__section__(#S))) +#define __section(S) __attribute__((__section__(#S))) #ifndef __always_inline -# define __always_inline inline __attribute__((always_inline)) +#define __always_inline inline __attribute__((always_inline)) #endif -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) #ifndef always_inline -# define always_inline __always_inline +#define always_inline __always_inline #endif #ifndef noinline -# define noinline __attribute__((noinline)) +#define noinline __attribute__((noinline)) #endif -#define __aligned(x) __attribute__((aligned(x))) +#define __aligned(x) __attribute__((aligned(x))) #ifndef offsetof -# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER) #endif -#define barrier() asm volatile("" ::: "memory") - -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) - -#define __round_mask(x, y) ((__typeof__(x))((y) - 1)) -#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) -#define round_down(x, y) ((x) & ~__round_mask(x, y)) -#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) -#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) - -#define min(x, y) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - (void) (&_min1 
== &_min2); \ - _min1 < _min2 ? _min1 : _min2; }) - -#define max(x, y) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - (void) (&_max1 == &_max2); \ - _max1 > _max2 ? _max1 : _max2; }) - -#define is_log2(v) (((v) & ((v) - 1)) == 0) +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) + +#define min(x, y) \ + ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void)(&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; \ + }) + +#define max(x, y) \ + ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void)(&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; \ + }) + +#define is_log2(v) (((v) & ((v)-1)) == 0) #endif /* __COMPILER_H__ */ diff --git a/flog/include/log.h b/flog/include/log.h index 1a165ea9fb..8aafe44b75 100644 --- a/flog/include/log.h +++ b/flog/include/log.h @@ -3,15 +3,15 @@ #include -#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) +#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) #if 1 -# define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) #else -# define pr_debug(fmt, ...) +#define pr_debug(fmt, ...) #endif -#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): "fmt, __FILE__, __LINE__, ##__VA_ARGS__) -#define pr_perror(fmt, ...) fprintf(stderr, "Error (%s:%d): "fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_perror(fmt, ...) 
fprintf(stderr, "Error (%s:%d): " fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) #endif /* __LOG_H__ */ diff --git a/flog/include/types.h b/flog/include/types.h index 0e15bfbff5..07c992968b 100644 --- a/flog/include/types.h +++ b/flog/include/types.h @@ -4,13 +4,13 @@ #include #include -typedef uint64_t u64; -typedef int64_t s64; -typedef uint32_t u32; -typedef int32_t s32; -typedef uint16_t u16; -typedef int16_t s16; -typedef uint8_t u8; -typedef int8_t s8; +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; #endif /* __FLOG_TYPES_H__ */ diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h index 2d879110fc..6061f4556a 100644 --- a/flog/include/uapi/flog.h +++ b/flog/include/uapi/flog.h @@ -13,68 +13,59 @@ /* * By Laurent Deniau at https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s */ -#define FLOG_PP_NARG_(...) FLOG_PP_ARG_N(__VA_ARGS__) -#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) +#define FLOG_PP_NARG_(...) FLOG_PP_ARG_N(__VA_ARGS__) +#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) -#define FLOG_PP_ARG_N( _0, _1, _2, _3, _4, \ - _5, _6, _7, _8, _9, \ - _10,_11,_12,_13,_14, \ - _15,_16,_17,_18,_19, \ - _20,_21,_22,_23,_24, \ - _25,_26,_27,_28,_29, \ - _30,_31, N, ...) N +#define FLOG_PP_ARG_N(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, \ + _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, N, ...) \ + N -#define FLOG_PP_RSEQ_N() \ - 31, 30, 29, 28, 27, \ - 26, 25, 24, 23, 22, \ - 21, 20, 19, 18, 17, \ - 16, 15, 14, 13, 12, \ - 11, 10, 9, 8, 7, \ - 6, 5, 4, 3, 2, \ - 1, 0 +#define FLOG_PP_RSEQ_N() \ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, \ + 2, 1, 0 -#define FLOG_GENMASK_0(N, x) 0 -#define FLOG_GENMASK_1(N, op, x, ...) 
(op(N, 0, x)) -#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_11(N, op, x, ...) ((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_17(N, op, x, ...) ((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_22(N, op, x, ...) 
((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_32(N, op, x, ...) ((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_0(N, x) 0 +#define FLOG_GENMASK_1(N, op, x, ...) (op(N, 0, x)) +#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_11(N, op, x, ...) 
((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_17(N, op, x, ...) ((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_22(N, op, x, ...) ((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_32(N, op, x, ...) 
((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) -#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) -#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) -#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 +#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) +#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) +#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 -#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) -#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) +#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) +#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) -#define flog_genbit(ord, n, v, ...) \ +#define flog_genbit(ord, n, v, ...) \ _Generic((v), \ \ /* Basic types */ \ @@ -127,21 +118,20 @@ default: -1) typedef struct { - unsigned int magic; - unsigned int size; - unsigned int nargs; - unsigned int mask; - long fmt; - long args[0]; + unsigned int magic; + unsigned int size; + unsigned int nargs; + unsigned int mask; + long fmt; + long args[0]; } flog_msg_t; extern int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...); void flog_decode_msg(int fdout, const char *format, ...); extern int flog_decode_all(int fdin, int fdout); -#define flog_encode(fdout, fmt, ...) \ - flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), \ - FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) +#define flog_encode(fdout, fmt, ...) \ + flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) int flog_map_buf(int fdout); int flog_close(int fdout); diff --git a/flog/include/util.h b/flog/include/util.h index 17a4d77997..7b1edb6885 100644 --- a/flog/include/util.h +++ b/flog/include/util.h @@ -7,31 +7,35 @@ #include "log.h" #include "types.h" -#define __xalloc(op, size, ...) 
\ - ({ \ - void *___p = op(__VA_ARGS__); \ - ___p; \ +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + ___p; \ }) -#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) -#define xmalloc(size) __xalloc(malloc, size, size) -#define xzalloc(size) __xalloc(calloc, size, 1, size) -#define xrealloc(p, size) __xalloc(realloc, size, p, size) - -#define xfree(p) do { if (p) free(p); } while (0) - -#define xrealloc_safe(pptr, size) \ - ({ \ - int __ret = -ENOMEM; \ - void *new = xrealloc(*pptr, size); \ - if (new) { \ - *pptr = new; \ - __ret = 0; \ - } \ - __ret; \ - }) - -#define memzero_p(p) memset(p, 0, sizeof(*p)) -#define memzero(p, size) memset(p, 0, size) +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) \ + do { \ + if (p) \ + free(p); \ + } while (0) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -ENOMEM; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) #endif /* __UTIL_H__ */ diff --git a/flog/src/flog.c b/flog/src/flog.c index 40cce3fedc..8f11a36cbf 100644 --- a/flog/src/flog.c +++ b/flog/src/flog.c @@ -13,7 +13,7 @@ #define MAGIC 0xABCDABCD -#define BUF_SIZE (1<<20) +#define BUF_SIZE (1 << 20) static char _mbuf[BUF_SIZE]; static char *mbuf = _mbuf; static char *fbuf; @@ -119,11 +119,10 @@ int flog_map_buf(int fdout) } if (!fbuf) - addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, - MAP_FILE | MAP_SHARED, fdout, fsize - 2 * BUF_SIZE); + addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, MAP_FILE | MAP_SHARED, fdout, + fsize - 2 * BUF_SIZE); else - addr = mremap(fbuf + BUF_SIZE, BUF_SIZE, - BUF_SIZE * 2, MREMAP_FIXED, fbuf); + addr = mremap(fbuf + BUF_SIZE, 
BUF_SIZE, BUF_SIZE * 2, MREMAP_FIXED, fbuf); if (addr == MAP_FAILED) { fprintf(stderr, "Unable to map a buffer: %m"); return -1; @@ -160,7 +159,7 @@ int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char if (mbuf != _mbuf && flog_map_buf(fdout)) return -1; - m = (void *) mbuf; + m = (void *)mbuf; m->nargs = nargs; m->mask = mask; diff --git a/flog/src/main.c b/flog/src/main.c index c84e774781..fc5d64ebd2 100644 --- a/flog/src/main.c +++ b/flog/src/main.c @@ -33,12 +33,9 @@ int main(int argc, char *argv[]) static const char short_opts[] = "m:o:di:h"; static struct option long_opts[] = { - { "mode", required_argument, 0, 'm' }, - { "output", required_argument, 0, 'o' }, - { "decode", no_argument, 0, 'd' }, - { "iter", required_argument, 0, 'i' }, - { "help", no_argument, 0, 'h' }, - { }, + { "mode", required_argument, 0, 'm' }, { "output", required_argument, 0, 'o' }, + { "decode", no_argument, 0, 'd' }, { "iter", required_argument, 0, 'i' }, + { "help", no_argument, 0, 'h' }, {}, }; while (1) { @@ -68,8 +65,7 @@ int main(int argc, char *argv[]) } else { fdout = open(optarg, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fdout < 0) { - fprintf(stderr, "Can't open %s: %s\n", - optarg, strerror(errno)); + fprintf(stderr, "Can't open %s: %s\n", optarg, strerror(errno)); exit(1); } } @@ -94,42 +90,35 @@ int main(int argc, char *argv[]) if (fdout != STDOUT_FILENO && flog_map_buf(fdout)) return 1; for (i = 0; i < niter; i++) - if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", - str1, str2, 'c', (long)-4, (short)2, - (unsigned long)2)) + if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, + (short)2, (unsigned long)2)) return 1; if (flog_close(fdout)) return 1; - break; - case MODE_DPRINTF: - { + break; + case MODE_DPRINTF: { for (i = 0; i < niter; i++) { - dprintf(fdout, "Some message %s %s %c %li %d %lu\n", - str1, str2, 'c', (long)-4, (short)2, + dprintf(fdout, "Some message %s %s %c %li %d %lu\n", str1, 
str2, 'c', (long)-4, (short)2, (unsigned long)2); } break; } - case MODE_FPRINTF: - { + case MODE_FPRINTF: { FILE *f = fdopen(fdout, "w"); for (i = 0; i < niter; i++) { - fprintf(f, "Some message %s %s %c %li %d %lu\n", - str1, str2, 'c', (long)-4, (short)2, + fprintf(f, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, (unsigned long)2); fflush(f); } fclose(f); break; } - case MODE_SPRINTF: - { + case MODE_SPRINTF: { static char buf[4096]; for (i = 0; i < niter; i++) { - sprintf(buf, "Some message %s %s %c %li %d %lu\n", - str1, str2, 'c', (long)-4, (short)2, + sprintf(buf, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, (unsigned long)2); } break; From d355c3681a4fc587b7279ba4d7be261b09dcde37 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 6 Apr 2022 17:45:57 -0700 Subject: [PATCH 004/122] flog: typo: mmaped -> mmapped It is mapped, not maped. Same applies for mmap I guess. Found by codespell, except it wants to change it to mapped, which will make it less specific. Signed-off-by: Kir Kolyshkin --- flog/src/flog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flog/src/flog.c b/flog/src/flog.c index 8f11a36cbf..f48b7f127e 100644 --- a/flog/src/flog.c +++ b/flog/src/flog.c @@ -94,7 +94,7 @@ int flog_map_buf(int fdout) void *addr; /* - * Two buffers are mmaped into memory. A new one is mapped when a first + * Two buffers are mmapped into memory. A new one is mapped when a first * one is completly filled. */ if (fbuf && (mbuf - fbuf < BUF_SIZE)) From 6eafe4ea7a551ddc74ec7e37e6343eebb7297812 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 30 Mar 2022 18:45:16 -0700 Subject: [PATCH 005/122] flog: fix some codespell warnings Brought to you by codespell -w (using codespell v2.1.0). 
[v2: use "make indent" on the result] Signed-off-by: Kir Kolyshkin --- flog/src/flog.c | 2 +- flog/src/main.c | 51 ++++++++++++++++++++++++------------------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/flog/src/flog.c b/flog/src/flog.c index f48b7f127e..d7660f18d8 100644 --- a/flog/src/flog.c +++ b/flog/src/flog.c @@ -95,7 +95,7 @@ int flog_map_buf(int fdout) /* * Two buffers are mmapped into memory. A new one is mapped when a first - * one is completly filled. + * one is completely filled. */ if (fbuf && (mbuf - fbuf < BUF_SIZE)) return 0; diff --git a/flog/src/main.c b/flog/src/main.c index fc5d64ebd2..e027917c68 100644 --- a/flog/src/main.c +++ b/flog/src/main.c @@ -129,31 +129,30 @@ int main(int argc, char *argv[]) return 0; usage: - fprintf(stderr, - "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" - "\n" - - "Examples:\n" - "\n" - - " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" - "\n" - " flog -m dprintf -i 100000\n" - "\n" - - " - run 100000 iterations in binary mode without processing (queue messages only)\n" - "\n" - " flog -i 100000\n" - "\n" - - " - run 100000 iterations in binary mode with decoding after\n" - "\n" - " flog -i 100000 -d\n" - "\n" - - " - run 100000 iterations in binary mode with decoding after, writting results into 'out' file\n" - "\n" - " flog -i 100000 -d -o out\n" - "\n"); + fprintf(stderr, "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" + "\n" + + "Examples:\n" + "\n" + + " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" + "\n" + " flog -m dprintf -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode without processing (queue messages only)\n" + "\n" + " flog -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after\n" + "\n" + " flog -i 100000 -d\n" + "\n" + + " - run 100000 iterations in binary mode with 
decoding after, writing results into 'out' file\n" + "\n" + " flog -i 100000 -d -o out\n" + "\n"); return 1; } From 4795374150886928eb7c0759b27df399a00325e6 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 27 Apr 2022 14:15:37 +0300 Subject: [PATCH 006/122] cr-dump: do not report success to logs if post-dump script failed It can be confusing to see error from post-dump action script and non zero return from criu though at the same time see "Dumping finished successfully" in log. I believe it is logical to consider post-dump action script as a part of "dump" process so fail in it means that the whole dump failed. Signed-off-by: Pavel Tikhomirov --- criu/cr-dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f58701e5c5..60e90baed2 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2049,7 +2049,7 @@ static int cr_dump_finish(int ret) close_service_fd(CR_PROC_FD_OFF); close_image_dir(); - if (ret) { + if (ret || post_dump_ret) { pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); From 03539d4d3c4f36b367815f8da856e57f5699c620 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 8 May 2022 21:06:28 +0100 Subject: [PATCH 007/122] ci: Fix unsafe repository error Signed-off-by: Radostin Stoyanov --- .github/workflows/lint.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c1215aeafe..c3886c7070 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,9 +10,16 @@ jobs: steps: - name: Install tools run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils codespell + - uses: actions/checkout@v2 + + - name: Set git safe directory + # https://github.com/actions/checkout/issues/760 + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + - name: Run make lint run: make lint + - name: Run make indent run: > make indent && From 
df67400a7b1da90187990f0344e53cc7f253c671 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 8 May 2022 16:04:26 +0700 Subject: [PATCH 008/122] mem: Skip pre-dumping on hugetlb mappings As private hugetlb mappings are not pre-mapped, the content of them is restored in the restorer which cannot use page_read->read_pages. As a result, we cannot recursively read the content of pre-dumped image in the parent directory and use preadv to read the content from the last dumped image only. Therefore, it may freeze while restoring when the content of mapping is in pre-dumped image in parent directory. We need to skip pre-dumping on hugetlb mappings to resolve the issue. Suggested-by: Alexander Mikhalitsyn Signed-off-by: Bui Quang Minh --- criu/mem.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/criu/mem.c b/criu/mem.c index 136439518f..ab86a1f6d7 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -246,6 +246,12 @@ prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_lis */ if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) continue; + /* + * We totally ignore MAP_HUGETLB on pre-dump. + * See also generate_vma_iovs() comment. + */ + if ((vma->e->flags & MAP_HUGETLB) && skip_non_trackable) + continue; if (vma->e->prot & PROT_READ) continue; @@ -402,7 +408,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - if (vma_entry_is(vma->e, VMA_AREA_AIORING)) { + /* + * We want to completely ignore these VMA types on the pre-dump: + * 1. VMA_AREA_AIORING because it is not soft-dirty trackable (kernel writes) + * 2. MAP_HUGETLB mappings because they are not premapped and we can't use + * parent images from pre-dump stages. Instead, the content is restored from + * the parasite context using full memory image. 
+ */ + if (vma_entry_is(vma->e, VMA_AREA_AIORING) || vma->e->flags & MAP_HUGETLB) { if (pre_dump) return 0; has_parent = false; From 2de7eea6be5874ecf06a86adb89ef7aa255f4013 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 8 May 2022 16:19:45 +0700 Subject: [PATCH 009/122] Revert "ci: skip new hugetlb maps09/maps10 tests for pre-dump" This reverts commit 37ea8c5fcfef2108800b6d53054f3a7c4f710752. Signed-off-by: Bui Quang Minh --- test/jenkins/criu-dedup.sh | 2 +- test/jenkins/criu-lazy-migration.sh | 2 +- test/jenkins/criu-lazy-pages.sh | 2 +- test/jenkins/criu-pre-dump.sh | 5 ++--- test/jenkins/criu-remote-lazy-pages.sh | 2 +- test/jenkins/criu-snap.sh | 4 ++-- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/test/jenkins/criu-dedup.sh b/test/jenkins/criu-dedup.sh index 842d218bd5..edb1b653d1 100755 --- a/test/jenkins/criu-dedup.sh +++ b/test/jenkins/criu-dedup.sh @@ -4,7 +4,7 @@ set -e source `dirname $0`/criu-lib.sh prep -./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 -x maps09 -x maps10 || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 || fail # Additionally run these tests as they touch a lot of # memory and it makes sense to additionally check it diff --git a/test/jenkins/criu-lazy-migration.sh b/test/jenkins/criu-lazy-migration.sh index b23f31c79d..02a212e0d0 100755 --- a/test/jenkins/criu-lazy-migration.sh +++ b/test/jenkins/criu-lazy-migration.sh @@ -15,7 +15,7 @@ LAZY_MIGRATE_EXCLUDE="-x fifo_loop -x file_locks -x ptrace_sig -x overmount_file --lazy-migrate $LAZY_EXCLUDE $LAZY_MIGRATE_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 -f uns \ diff --git 
a/test/jenkins/criu-lazy-pages.sh b/test/jenkins/criu-lazy-pages.sh index f629120909..9ef7217391 100755 --- a/test/jenkins/criu-lazy-pages.sh +++ b/test/jenkins/criu-lazy-pages.sh @@ -12,7 +12,7 @@ source `dirname $0`/criu-lazy-common.sh --lazy-pages $LAZY_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ diff --git a/test/jenkins/criu-pre-dump.sh b/test/jenkins/criu-pre-dump.sh index b2972d941f..137f7c23fd 100755 --- a/test/jenkins/criu-pre-dump.sh +++ b/test/jenkins/criu-pre-dump.sh @@ -5,6 +5,5 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -# FIXME: https://github.com/checkpoint-restore/criu/issues/1868 -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' -x 'maps09' -x 'maps10' || fail -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' -x 'maps09' -x 'maps10' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' || fail diff --git a/test/jenkins/criu-remote-lazy-pages.sh b/test/jenkins/criu-remote-lazy-pages.sh index 48787f3f63..1c677e3336 100755 --- a/test/jenkins/criu-remote-lazy-pages.sh +++ b/test/jenkins/criu-remote-lazy-pages.sh @@ -12,7 +12,7 @@ source `dirname $0`/criu-lazy-common.sh --remote-lazy-pages $LAZY_EXCLUDE -x maps04 || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from "remote" dump with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ diff --git a/test/jenkins/criu-snap.sh 
b/test/jenkins/criu-snap.sh index d8fdf02b3a..b08c57f523 100755 --- a/test/jenkins/criu-snap.sh +++ b/test/jenkins/criu-snap.sh @@ -5,5 +5,5 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' -x 'maps09' -x 'maps10' || fail -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' -x 'maps09' -x 'maps10' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' || fail From a1fb2eca3f22991240c18afacbfd3f45f8a6edf5 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 10 May 2022 20:40:53 +0300 Subject: [PATCH 010/122] zdtm: skip zdtm/static/shm-hugetlb when hugetlb is not supported Reported-by: Mr. Jenkins (ppc64le) Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/shm-hugetlb.checkskip | 4 ++++ 1 file changed, 4 insertions(+) create mode 100755 test/zdtm/static/shm-hugetlb.checkskip diff --git a/test/zdtm/static/shm-hugetlb.checkskip b/test/zdtm/static/shm-hugetlb.checkskip new file mode 100755 index 0000000000..df23708156 --- /dev/null +++ b/test/zdtm/static/shm-hugetlb.checkskip @@ -0,0 +1,4 @@ +#!/bin/bash + +# will fail with EOPNOTSUPP +cat /proc/sys/vm/nr_hugepages &> /dev/null From 3c8aa309fc703f5f8b8c6779360b828a99fceff8 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 11 May 2022 19:03:36 +0100 Subject: [PATCH 011/122] crit: Use same version as criu Name collision with an abandoned project named 'crit' in pypi causes pip to show crit (CRiu Image Tool) as outdated. This patch updates crit to use the same version and license as criu. 
Fixes #1878 Signed-off-by: Radostin Stoyanov --- Makefile | 1 + scripts/crit-setup.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e67bcc5614..f8d44626b5 100644 --- a/Makefile +++ b/Makefile @@ -428,6 +428,7 @@ lint: flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py flake8 --config=scripts/flake8.cfg scripts/criu-ns + flake8 --config=scripts/flake8.cfg scripts/crit-setup.py flake8 --config=scripts/flake8.cfg coredump/ shellcheck --version shellcheck scripts/*.sh diff --git a/scripts/crit-setup.py b/scripts/crit-setup.py index 871e55921e..13df03e3b5 100644 --- a/scripts/crit-setup.py +++ b/scripts/crit-setup.py @@ -1,10 +1,24 @@ +import os from distutils.core import setup +criu_version = "0.0.1" +env = os.environ + +if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: + criu_version = '{}.{}'.format( + env['CRIU_VERSION_MAJOR'], + env['CRIU_VERSION_MINOR'] + ) + + if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: + criu_version += '.' + env['CRIU_VERSION_SUBLEVEL'] + setup(name="crit", - version="0.0.1", + version=criu_version, description="CRiu Image Tool", author="CRIU team", author_email="criu@openvz.org", + license="GPLv2", url="https://github.com/checkpoint-restore/criu", package_dir={'pycriu': 'lib/py'}, packages=["pycriu", "pycriu.images"], From be6d7ca8e61df8bd59f491279a3bd982c14911e4 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 27 Apr 2022 06:51:47 +0300 Subject: [PATCH 012/122] page-pipe: fix limiting a pipe size But actually, 5a92f100b88e probably has to be reverted as a whole. PIPE_MAX_SIZE is the hard limit to avoid PAGE_ALLOC_COSTLY_ORDER allocations in the kernel. But F_SETPIPE_SZ rounds up a requested pipe size to a power-of-2 pages. It means that when we request PIPE_MAX_SIZE that isn't a power-of-2 number, we actually request a pipe size greater than PIPE_MAX_SIZE. 
Fixes: 5a92f100b88e ("page-pipe: Resize up to PIPE_MAX_SIZE") Signed-off-by: Andrei Vagin --- criu/page-pipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/page-pipe.c b/criu/page-pipe.c index 5a7e50bc19..54dc3ccc41 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -56,7 +56,7 @@ static inline int ppb_resize_pipe(struct page_pipe_buf *ppb) if (new_size > PIPE_MAX_SIZE) { if (ppb->pipe_size < PIPE_MAX_SIZE) - ppb->pipe_size = PIPE_MAX_SIZE; + new_size = PIPE_MAX_SIZE; else return 1; } From d84e2e4e94ca933f511644731ba55531eb75b0ce Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 27 Apr 2022 06:59:02 +0300 Subject: [PATCH 013/122] page-xfer: use negative values for error codes Signed-off-by: Andrei Vagin --- criu/page-xfer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 2543a462a9..6599e29cf4 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -736,7 +736,7 @@ static long fill_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufve continue; } else if (errno == ESRCH) { pr_debug("Target process PID:%d not found\n", pid); - return ESRCH; + return -ESRCH; } } @@ -798,7 +798,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len); - if (bytes_read == ESRCH) { + if (bytes_read == -ESRCH) { munmap(userbuf, BUFFER_SIZE); return -1; } From 6d879d570d011a281f7c39e224810aca1351f127 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 27 Apr 2022 06:56:37 +0300 Subject: [PATCH 014/122] page-xfer: adjust a buffer to a pipe size Due to side effects of F_SETPIPE_SZ, the actual pipe size can be greater than PIPE_MAX_SIZE. 
Signed-off-by: Andrei Vagin --- criu/include/page-xfer.h | 2 +- criu/page-xfer.c | 63 +++++++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 1bcd4ff205..36fe670928 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -10,7 +10,7 @@ struct ps_info { extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd); /* User buffer for read-mode pre-dump*/ -#define BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) +#define PIPE_MAX_BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) /* * page_xfer -- transfer pages into image file. diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 6599e29cf4..3d29fbf78b 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -777,31 +777,48 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p struct page_pipe_buf *ppb; unsigned int cur_hole = 0, i; unsigned long ret, bytes_read; + unsigned long userbuf_len; struct iovec bufvec; - struct iovec aux_iov[PIPE_MAX_SIZE]; + struct iovec *aux_iov; unsigned long aux_len; + void *userbuf; - char *userbuf = mmap(NULL, BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - + userbuf_len = PIPE_MAX_BUFFER_SIZE; + userbuf = mmap(NULL, userbuf_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (userbuf == MAP_FAILED) { pr_perror("Unable to mmap a buffer"); return -1; } + aux_iov = xmalloc(userbuf_len / PAGE_SIZE * sizeof(aux_iov[0])); + if (!aux_iov) + goto err; list_for_each_entry(ppb, &pp->bufs, l) { + if (ppb->pipe_size * PAGE_SIZE > userbuf_len) { + void *addr; + + addr = mremap(userbuf, userbuf_len, ppb->pipe_size * PAGE_SIZE, MREMAP_MAYMOVE); + if (addr == MAP_FAILED) { + pr_perror("Unable to mmap a buffer"); + goto err; + } + userbuf_len = ppb->pipe_size * PAGE_SIZE; + userbuf = addr; + addr = xrealloc(aux_iov, ppb->pipe_size * sizeof(aux_iov[0])); + if (!addr) + goto err; + aux_iov = addr; + } 
timing_start(TIME_MEMDUMP); aux_len = 0; - bufvec.iov_len = BUFFER_SIZE; + bufvec.iov_len = userbuf_len; bufvec.iov_base = userbuf; bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len); - - if (bytes_read == -ESRCH) { - munmap(userbuf, BUFFER_SIZE); - return -1; - } + if (bytes_read == -ESRCH) + goto err; bufvec.iov_base = userbuf; bufvec.iov_len = bytes_read; @@ -809,8 +826,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p if (ret == -1 || ret != bytes_read) { pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret); - munmap(userbuf, BUFFER_SIZE); - return -1; + goto err; } timing_stop(TIME_MEMDUMP); @@ -822,10 +838,8 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p u32 flags; ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base); - if (ret) { - munmap(userbuf, BUFFER_SIZE); - return ret; - } + if (ret) + goto err; BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; @@ -833,24 +847,25 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p flags = ppb_xfer_flags(xfer, ppb); - if (xfer->write_pagemap(xfer, &iov, flags)) { - munmap(userbuf, BUFFER_SIZE); - return -1; - } + if (xfer->write_pagemap(xfer, &iov, flags)) + goto err; - if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) { - munmap(userbuf, BUFFER_SIZE); - return -1; - } + if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) + goto err; } timing_stop(TIME_MEMWRITE); } - munmap(userbuf, BUFFER_SIZE); + munmap(userbuf, userbuf_len); + xfree(aux_iov); timing_start(TIME_MEMWRITE); return dump_holes(xfer, pp, &cur_hole, NULL); +err: + munmap(userbuf, userbuf_len); + xfree(aux_iov); + return -1; } int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) From a6aae071fe8c8e86a4e18db69e35ad1ca64a431c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 27 Apr 2022 07:02:58 +0300 Subject: [PATCH 015/122] pre-dump: call vmsplice with SPLICE_F_GIFT In this 
case, vmsplice attaches pages without copying them. Signed-off-by: Andrei Vagin --- criu/page-xfer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 3d29fbf78b..2a9f6e2cc2 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -822,7 +822,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p bufvec.iov_base = userbuf; bufvec.iov_len = bytes_read; - ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK); + ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK | SPLICE_F_GIFT); if (ret == -1 || ret != bytes_read) { pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret); From 574f39642b313de5282de8ed60970453c6629d6c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Apr 2022 19:09:07 +0300 Subject: [PATCH 016/122] page-xfer: refactoring analyze_iov and fill_userbuf * handle unexpected errors of process_vm_readv * adjust riovs in analyze_iov * call handle_faulty_iov only if process_vm_readv returns EFAULT. Signed-off-by: Andrei Vagin --- criu/page-xfer.c | 67 +++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 2a9f6e2cc2..782d4cafce 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -617,31 +617,18 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p */ unsigned long handle_faulty_iov(int pid, struct iovec *riov, unsigned long faulty_index, struct iovec *bufvec, - struct iovec *aux_iov, unsigned long *aux_len, unsigned long partial_read_bytes) + struct iovec *aux_iov, unsigned long *aux_len) { struct iovec dummy; ssize_t bytes_read; - unsigned long offset = 0; unsigned long final_read_cnt = 0; - /* Handling Case 2*/ - if (riov[faulty_index].iov_len == PAGE_SIZE) { - cnt_sub(CNT_PAGES_WRITTEN, 1); - return 0; - } - /* Handling Case 3-Part 3.2*/ - offset = (partial_read_bytes) ? 
partial_read_bytes : PAGE_SIZE; - - dummy.iov_base = riov[faulty_index].iov_base + offset; - dummy.iov_len = riov[faulty_index].iov_len - offset; - - if (!partial_read_bytes) - cnt_sub(CNT_PAGES_WRITTEN, 1); + dummy.iov_base = riov[faulty_index].iov_base; + dummy.iov_len = riov[faulty_index].iov_len; while (dummy.iov_len) { bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0); - if (bytes_read == -1) { /* Handling faulty page read in faulty iov */ cnt_sub(CNT_PAGES_WRITTEN, 1); @@ -671,14 +658,12 @@ unsigned long handle_faulty_iov(int pid, struct iovec *riov, unsigned long fault /* * This function will position start pointer to the latest - * successfully read iov in iovec. In case of partial read it - * returns partial_read_bytes, otherwise 0. + * successfully read iov in iovec. */ static unsigned long analyze_iov(ssize_t bytes_read, struct iovec *riov, unsigned long *index, struct iovec *aux_iov, unsigned long *aux_len) { ssize_t processed_bytes = 0; - unsigned long partial_read_bytes = 0; /* correlating iovs with read bytes */ while (processed_bytes < bytes_read) { @@ -692,13 +677,17 @@ static unsigned long analyze_iov(ssize_t bytes_read, struct iovec *riov, unsigne /* handling partially processed faulty iov*/ if (processed_bytes - bytes_read) { + unsigned long partial_read_bytes = 0; + (*index) -= 1; partial_read_bytes = riov[*index].iov_len - (processed_bytes - bytes_read); aux_iov[*aux_len - 1].iov_len = partial_read_bytes; + riov[*index].iov_base += partial_read_bytes; + riov[*index].iov_len -= partial_read_bytes; } - return partial_read_bytes; + return 0; } /* @@ -723,40 +712,36 @@ static long fill_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufve ssize_t bytes_read; unsigned long total_read = 0; unsigned long start = 0; - unsigned long partial_read_bytes = 0; while (start < ppb->nr_segs) { bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start], ppb->nr_segs - start, 0); - if (bytes_read == -1) { + if (errno == ESRCH) { + 
pr_debug("Target process PID:%d not found\n", pid); + return -ESRCH; + } + if (errno != EFAULT) { + pr_perror("process_vm_readv failed"); + return -1; + } /* Handling Case 1*/ if (riov[start].iov_len == PAGE_SIZE) { cnt_sub(CNT_PAGES_WRITTEN, 1); start += 1; continue; - } else if (errno == ESRCH) { - pr_debug("Target process PID:%d not found\n", pid); - return -ESRCH; } + total_read += handle_faulty_iov(pid, riov, start, bufvec, aux_iov, aux_len); + start += 1; + continue; } - partial_read_bytes = 0; - if (bytes_read > 0) { - partial_read_bytes = analyze_iov(bytes_read, riov, &start, aux_iov, aux_len); + if (analyze_iov(bytes_read, riov, &start, aux_iov, aux_len) < 0) + return -1; bufvec->iov_base += bytes_read; bufvec->iov_len -= bytes_read; total_read += bytes_read; } - - /* - * If all iovs not processed in one go, - * it means some iov in between has failed. - */ - if (start < ppb->nr_segs) - total_read += handle_faulty_iov(pid, riov, start, bufvec, aux_iov, aux_len, partial_read_bytes); - - start += 1; } return total_read; @@ -817,7 +802,13 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p bufvec.iov_base = userbuf; bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len); - if (bytes_read == -ESRCH) + if (bytes_read == -ESRCH) { + timing_stop(TIME_MEMDUMP); + munmap(userbuf, userbuf_len); + xfree(aux_iov); + return 0; + } + if (bytes_read < 0) goto err; bufvec.iov_base = userbuf; From 45641ab26d7bb78706a6215fdef8f9133abf8d10 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Apr 2022 18:51:32 +0300 Subject: [PATCH 017/122] ci: test the read mode of pre-dump Signed-off-by: Andrei Vagin --- scripts/ci/run-ci-tests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 81aa072363..8d9de6e55f 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -268,6 +268,7 @@ make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/transition/maps007 
--pre 2 --noauto-dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server --dedup +./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --pre-dump-mode read ./test/zdtm.py run -t zdtm/transition/pid_reuse --pre 2 # start time based pid reuse detection ./test/zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2 # pidfd based pid reuse detection From cd0ed7e5491c7c827b887b2d80cf35e87819278b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 15 May 2022 17:08:18 +0100 Subject: [PATCH 018/122] amdgpu/Makefile: Fix include path When building packages for CRIU the source directory might have a name different than 'criu'. Fixes: #1877 Reported-by: @siris Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 84b9f87147..971b93b516 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -2,7 +2,7 @@ PLUGIN_NAME := amdgpu_plugin PLUGIN_SOBJ := amdgpu_plugin.so -PLUGIN_INCLUDE := -iquote../../../criu/include +PLUGIN_INCLUDE := -iquote../../include PLUGIN_INCLUDE += -iquote../../criu/include PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ PLUGIN_INCLUDE += -iquote../../ From 2b3763fe6a39033d4e3d3d80e28911d1737ac23b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 8 May 2022 11:07:25 +0100 Subject: [PATCH 019/122] amdgpu: Set PLUGINDIR to /usr/lib/criu Building the criu packages for Ubuntu/Debian fails with: mkdir: cannot create directory '/var/lib/criu': Permission denied This patch updates PLUGINDIR with the value /usr/lib/criu Fixes: #1877 Signed-off-by: Radostin Stoyanov --- Makefile.install | 2 +- criu/include/plugin.h | 4 +++- plugins/amdgpu/Makefile | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Makefile.install b/Makefile.install index aafb954697..c798637beb 100644 --- 
a/Makefile.install +++ b/Makefile.install @@ -7,7 +7,7 @@ MANDIR ?= $(PREFIX)/share/man INCLUDEDIR ?= $(PREFIX)/include LIBEXECDIR ?= $(PREFIX)/libexec RUNDIR ?= /run -PLUGINDIR ?= /var/lib/criu +PLUGINDIR ?= $(PREFIX)/lib/criu # # For recent Debian/Ubuntu with multiarch support. diff --git a/criu/include/plugin.h b/criu/include/plugin.h index a1796b6418..0115e6ea01 100644 --- a/criu/include/plugin.h +++ b/criu/include/plugin.h @@ -5,7 +5,9 @@ #include "common/compiler.h" #include "common/list.h" -#define CR_PLUGIN_DEFAULT "/var/lib/criu/" +#ifndef CR_PLUGIN_DEFAULT +#define CR_PLUGIN_DEFAULT "/usr/lib/criu/" +#endif void cr_plugin_fini(int stage, int err); int cr_plugin_init(int stage); diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 971b93b516..367a52c99e 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -15,7 +15,7 @@ DEPS_NOK := ; include $(__nmk_dir)msg.mk CC := gcc -PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC +PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu ifeq ($(CONFIG_AMDGPU),y) @@ -50,16 +50,16 @@ clean: amdgpu_plugin_clean amdgpu_plugin_test_clean mrproper: clean install: - $(Q) mkdir -p $(PLUGINDIR) ifeq ($(CONFIG_AMDGPU),y) + $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 644 $(PLUGIN_SOBJ) $(PLUGINDIR) + $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) endif .PHONY: install uninstall: ifeq ($(CONFIG_AMDGPU),y) $(E) " UNINSTALL" $(PLUGIN_NAME) - $(Q) $(RM) $(PLUGINDIR)/$(PLUGIN_SOBJ) + $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) endif .PHONY: uninstall From 98eda32ae28b3ffcc849380c59ec9535586a1622 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 31 May 2022 11:25:03 +0300 Subject: [PATCH 020/122] github: use git-clang-format instead of make indent This allows us to only detect bad formatting in PR 
changes but not all the CRIU codebase. Signed-off-by: Pavel Tikhomirov --- .github/workflows/lint.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c3886c7070..d32403d052 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,7 +9,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils codespell + run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils codespell git-clang-format - uses: actions/checkout@v2 @@ -22,7 +22,12 @@ jobs: - name: Run make indent run: > - make indent && + if [ -z "${{github.base_ref}}" ]; then + make indent + else + git fetch origin ${{github.base_ref}} && + git clang-format --style file --extensions c,h --quiet origin/${{github.base_ref}} + fi && STATUS=$(git status --porcelain) && if [ ! -z "$STATUS" ]; then echo "FAIL: some files are not correctly formatted."; From 0db600d91cabebd28215ae2da1f7878bcaa5a9c8 Mon Sep 17 00:00:00 2001 From: Ashutosh Mehra Date: Mon, 30 May 2022 15:57:07 -0400 Subject: [PATCH 021/122] Fix the check for mnt namespace in criu-ns criu-ns script incorrectly compares the pidns fd with mntns fd. Also reversed the condition in is_my_namespace function to align it with the function name. 
Signed-off-by: Ashutosh Mehra --- scripts/criu-ns | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index 9fc58b6406..1217c3dcdf 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -153,9 +153,9 @@ def _set_namespace(fd): raise OSError(_errno, errno.errorcode[_errno]) -def is_my_namespace(fd): +def is_my_namespace(fd, ns): """Returns True if fd refers to current namespace""" - return os.stat('/proc/self/ns/pid').st_ino != os.fstat(fd).st_ino + return os.stat('/proc/self/ns/%s' % ns).st_ino == os.fstat(fd).st_ino def set_pidns(tpid, pid_idx): @@ -165,7 +165,7 @@ def set_pidns(tpid, pid_idx): pid namespace. """ ns_fd = os.open('/proc/%s/ns/pid' % tpid, os.O_RDONLY) - if is_my_namespace(ns_fd): + if not is_my_namespace(ns_fd, "pid"): for line in open('/proc/%s/status' % tpid): if not line.startswith('NSpid:'): continue @@ -190,7 +190,7 @@ def set_mntns(tpid): will be the same in target mntns. """ ns_fd = os.open('/proc/%s/ns/mnt' % tpid, os.O_RDONLY) - if is_my_namespace(ns_fd): + if not is_my_namespace(ns_fd, "mnt"): root_st = os.stat('/') cwd_st = os.stat('.') cwd_path = os.path.realpath('.') From baa4516e626453eb10336af06b725a07818210b4 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 9 Jun 2022 12:17:06 +0300 Subject: [PATCH 022/122] sk-unix: make add_fake_unix_queuers earlier and rework find_queuer_for Before this patch, if we had a unixsk with incoming scm packets (with fds) and with the sender side fd closed, we got an error: Error (criu/sk-unix.c:1125): unix: Can't find sender for 0x1e First part of the problem is that unix_note_scm_rights() expects to see a "queuer" which would send scm packets to the unixsk, and there is none, as the sender side is closed. Second part of the problem is that we already have "fake" queuers feature so that it already creates a unix socket pair and leaves other end open for later queuing packets. 
But function add_fake_unix_queuers() is called after unix_note_scm_rights() thus there is no chance to find queuer at the point of failure. Third part is that when we look for a queuer in find_queuer_for() we actually look for a socket for which we are a queuer and not for the socket which is a queuer for us, which is opposite to the name. For cases where both ends are alive both are queuers for each other so this was not important, but for our closed sender case it breaks. So let's reorder add_fake_unix_queuers() before unix_note_scm_rights() and make find_queuer_for() actually do what its name implies. This situation started to reproduce on Virtuozzo start/stop tests with the unixsk belonging to systemd; we suppose that this state where the sender fd side is closed happens rarely only on systemd start/stop, so we don't see it in regular suspend resume of long-living containers. Signed-off-by: Pavel Tikhomirov --- criu/cr-restore.c | 8 ++++---- criu/sk-unix.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9853c05854..398faf048d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -351,6 +351,10 @@ static int root_prepare_shared(void) if (ret) goto err; + ret = add_fake_unix_queuers(); + if (ret) + goto err; + /* * This should be called with all packets collected AND all * fdescs and fles prepared BUT post-prep-s not run. 
@@ -367,10 +371,6 @@ static int root_prepare_shared(void) if (ret) goto err; - ret = add_fake_unix_queuers(); - if (ret) - goto err; - show_saved_files(); err: return ret; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index c6021bc1f5..47e1b2962a 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -1021,8 +1021,8 @@ static struct unix_sk_info *find_queuer_for(int id) struct unix_sk_info *ui; list_for_each_entry(ui, &unix_sockets, list) { - if (ui->queuer && ui->queuer->ue->id == id) - return ui; + if (ui->queuer && ui->ue->id == id) + return ui->queuer; } return NULL; From 8a147da2d2b343f04f1b0a3a29dc94a1c95731d1 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 9 Jun 2022 17:48:37 +0300 Subject: [PATCH 023/122] zdtm/scm: add scm09 test with closed sender fd Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 2 ++ test/zdtm/static/scm00.c | 3 +++ test/zdtm/static/scm09.c | 1 + 3 files changed, 6 insertions(+) create mode 120000 test/zdtm/static/scm09.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 9dc02d4a58..aafc63f045 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -199,6 +199,7 @@ TST_NOFILE := \ scm04 \ scm05 \ scm06 \ + scm09 \ aio00 \ aio01 \ fd \ @@ -591,6 +592,7 @@ vdso01: LDLIBS += -lrt scm01: CFLAGS += -DKEEP_SENT_FD scm02: CFLAGS += -DSEND_BOTH scm04: CFLAGS += -DSEPARATE +scm09: CFLAGS += -DCLOSE_SENDER_FD mntns_link_remap: CFLAGS += -DZDTM_LINK_REMAP mntns_shared_bind02: CFLAGS += -DSHARED_BIND02 mntns_root_bind02: CFLAGS += -DROOT_BIND02 diff --git a/test/zdtm/static/scm00.c b/test/zdtm/static/scm00.c index d669755828..670e6fd6a4 100644 --- a/test/zdtm/static/scm00.c +++ b/test/zdtm/static/scm00.c @@ -105,6 +105,9 @@ int main(int argc, char **argv) p[1] = p[0]; p[0] = -1; #endif +#endif +#ifdef CLOSE_SENDER_FD + close(sk[0]); #endif test_daemon(); diff --git a/test/zdtm/static/scm09.c b/test/zdtm/static/scm09.c new file mode 120000 index 0000000000..4cab0edd20 --- /dev/null +++ 
b/test/zdtm/static/scm09.c @@ -0,0 +1 @@ +scm00.c \ No newline at end of file From ac272451348c8b59149902e99ba343633d0626fb Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 18 May 2022 18:25:43 +0300 Subject: [PATCH 024/122] mount-v2: split out restore_one_sharing helper This helper restores master_id and shared_id of first mount in the sharing group. It first copies sharing from either external source or internal parent sharing group and makes master_id from shared_id. Next it creates new shared_id when needed. All other mounts except first are just copied from the first one. Signed-off-by: Pavel Tikhomirov --- criu/mount-v2.c | 60 ++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index 623016d428..1d188114f7 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -925,27 +925,25 @@ static int move_mount_set_group(int src_id, char *source, int dst_id) return 0; } -static int restore_one_sharing_group(struct sharing_group *sg) +static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) { - struct mount_info *first, *other; - char first_path[PATH_MAX]; - int first_fd; + char target_path[PATH_MAX]; + int target_fd; - first = get_first_mount(sg); - first_fd = fdstore_get(first->mnt_fd_id); - BUG_ON(first_fd < 0); - snprintf(first_path, sizeof(first_path), "/proc/self/fd/%d", first_fd); + target_fd = fdstore_get(target->mnt_fd_id); + BUG_ON(target_fd < 0); + snprintf(target_path, sizeof(target_path), "/proc/self/fd/%d", target_fd); - /* Restore first's master_id from shared_id of the source */ + /* Restore target's master_id from shared_id of the source */ if (sg->master_id) { if (sg->parent) { - struct mount_info *p; + struct mount_info *first; /* Get shared_id from parent sharing group */ - p = get_first_mount(sg->parent); - if (move_mount_set_group(p->mnt_fd_id, NULL, first->mnt_fd_id)) { - pr_err("Failed to copy sharing from %d to %d\n", 
p->mnt_id, first->mnt_id); - close(first_fd); + first = get_first_mount(sg->parent); + if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { + pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); + close(target_fd); return -1; } } else { @@ -956,30 +954,42 @@ static int restore_one_sharing_group(struct sharing_group *sg) * or non-shared slave). If source is a private mount * we would fail. */ - if (move_mount_set_group(-1, sg->source, first->mnt_fd_id)) { - pr_err("Failed to copy sharing from source %s to %d\n", sg->source, first->mnt_id); - close(first_fd); + if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { + pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); + close(target_fd); return -1; } } /* Convert shared_id to master_id */ - if (mount(NULL, first_path, NULL, MS_SLAVE, NULL)) { - pr_perror("Failed to make mount %d slave", first->mnt_id); - close(first_fd); + if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { + pr_perror("Failed to make mount %d slave", target->mnt_id); + close(target_fd); return -1; } } - /* Restore first's shared_id */ + /* Restore target's shared_id */ if (sg->shared_id) { - if (mount(NULL, first_path, NULL, MS_SHARED, NULL)) { - pr_perror("Failed to make mount %d shared", first->mnt_id); - close(first_fd); + if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { + pr_perror("Failed to make mount %d shared", target->mnt_id); + close(target_fd); return -1; } } - close(first_fd); + close(target_fd); + + return 0; +} + +static int restore_one_sharing_group(struct sharing_group *sg) +{ + struct mount_info *first, *other; + + first = get_first_mount(sg); + + if (restore_one_sharing(sg, first)) + return -1; /* Restore sharing for other mounts from the sharing group */ list_for_each_entry(other, &sg->mnt_list, mnt_sharing) { From 7e376186512b7b83bb5a73d6e9ce75e5dc28c835 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 18 May 2022 18:25:43 
+0300 Subject: [PATCH 025/122] mount-v2: workaround for multiple external bindmounts with no common root It's a problem when while restoring sharing group we need to copy sharing between two mounts with non-intersecting roots, because kernel does not allow it. We have a case https://github.com/opencontainers/runc/pull/3442, where runc adds different devtmpfs file-bindmounts to container and there is no fsroot mount in container for this devtmpfs, thus mount-v2 faces the above problem. Luckily for the case of external mounts which are in one sharing group and which have non-intersecting roots, these mounts likely only have external master with no sharing, so we can just copy sharing from external source and make it slave as a workaround. https://github.com/checkpoint-restore/criu/issues/1886 Signed-off-by: Pavel Tikhomirov --- criu/mount-v2.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index 1d188114f7..5d53e9a226 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -996,9 +996,35 @@ static int restore_one_sharing_group(struct sharing_group *sg) if (other == first) continue; - if (move_mount_set_group(first->mnt_fd_id, NULL, other->mnt_fd_id)) { - pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, other->mnt_id); - return -1; + if (is_sub_path(other->root, first->root)) { + if (move_mount_set_group(first->mnt_fd_id, NULL, other->mnt_fd_id)) { + pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, other->mnt_id); + return -1; + } + } else { + /* + * Case where mounts of this sharing group don't have common root. + * For instance we can create two sub-directories .a and .b in some + * shared mount, bindmount them separately somethere and umount the + * original mount. Now we have both bindmounts shared between each + * other. 
Kernel only allows to copy sharing between mounts when + * source root contains destination root, which is not true for + * these two, so we can't just copy from first to other. + * + * For external sharing (!sg->parent) with only master_id (shared_id + * == 0) we can workaround this by copying from their external source + * instead (same as we did for a first mount). + * + * This is a w/a runc usecase, see https://github.com/opencontainers/runc/pull/3442 + */ + if (!sg->parent && !sg->shared_id) { + if (restore_one_sharing(sg, other)) + return -1; + } else { + pr_err("Can't copy sharing from %d[%s] to %d[%s]\n", first->mnt_id, first->root, + other->mnt_id, other->root); + return -1; + } } } From 58a2d982db258afe7d727a0477b4416753808a6c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 20 May 2022 12:11:49 +0300 Subject: [PATCH 026/122] zdtm: test multiple ext bindmounts with no common root and same master Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ext_multiple.c | 118 +++++++++++++++++++++++++ test/zdtm/static/mnt_ext_multiple.desc | 5 ++ 3 files changed, 124 insertions(+) create mode 100644 test/zdtm/static/mnt_ext_multiple.c create mode 100644 test/zdtm/static/mnt_ext_multiple.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index aafc63f045..5b88dfa1a6 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -407,6 +407,7 @@ TST_DIR = \ mntns_pivot_root \ mntns_pivot_root_ro \ mnt_ext_sharing \ + mnt_ext_multiple \ mount_complex_sharing \ mnt_tracefs \ mntns_deleted \ diff --git a/test/zdtm/static/mnt_ext_multiple.c b/test/zdtm/static/mnt_ext_multiple.c new file mode 100644 index 0000000000..7014927ac3 --- /dev/null +++ b/test/zdtm/static/mnt_ext_multiple.c @@ -0,0 +1,118 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check multiple non-common root external mounts with same external master"; +const char *test_author = 
"Pavel Tikhomirov "; + +char *dirname = "mnt_ext_multiple.test"; +char *source = "zdtm_ext_multiple"; +char *ext_source = "zdtm_ext_multiple.ext"; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char *root, testdir[PATH_MAX]; + char dst_a[PATH_MAX], dst_b[PATH_MAX]; + char src[PATH_MAX], src_a[PATH_MAX], src_b[PATH_MAX]; + char nsdst_a[PATH_MAX], nsdst_b[PATH_MAX]; + char *tmp = "/tmp/zdtm_ext_multiple.tmp"; + char *zdtm_newns = getenv("ZDTM_NEWNS"); + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare directories in test root */ + sprintf(testdir, "%s/%s", root, dirname); + mkdir(testdir, 0755); + sprintf(dst_a, "%s/%s/dst_a", root, dirname); + mkdir(dst_a, 0755); + sprintf(dst_b, "%s/%s/dst_b", root, dirname); + mkdir(dst_b, 0755); + + /* Prepare directories in criu root */ + mkdir(tmp, 0755); + if (mount(source, tmp, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + sprintf(src, "%s/src", tmp); + mkdir(src, 0755); + + /* Create a shared mount in criu mntns */ + if (mount(ext_source, src, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, src, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + if (mount(NULL, src, NULL, MS_SHARED, NULL)) { + pr_perror("make shared"); + return 1; + } + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + /* + * Populate to the tests root subdirectories of the src mount + */ + sprintf(src_a, "%s/src/a", tmp); + mkdir(src_a, 0755); + if (mount(src_a, dst_a, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + sprintf(src_b, "%s/src/b", 
tmp); + mkdir(src_b, 0755); + if (mount(src_b, dst_b, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + +test: + test_init(argc, argv); + + /* Make "external" mounts to have external master */ + sprintf(nsdst_a, "/%s/dst_a", dirname); + if (mount(NULL, nsdst_a, NULL, MS_SLAVE, NULL)) { + pr_perror("make slave"); + return 1; + } + sprintf(nsdst_b, "/%s/dst_b", dirname); + if (mount(NULL, nsdst_b, NULL, MS_SLAVE, NULL)) { + pr_perror("make slave"); + return 1; + } + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/test/zdtm/static/mnt_ext_multiple.desc b/test/zdtm/static/mnt_ext_multiple.desc new file mode 100644 index 0000000000..fd413ed15c --- /dev/null +++ b/test/zdtm/static/mnt_ext_multiple.desc @@ -0,0 +1,5 @@ +{ 'dopts': '--external mnt[/mnt_ext_multiple.test/dst_a]:MNT_A --external mnt[/mnt_ext_multiple.test/dst_b]:MNT_B', + 'feature': 'mnt_id move_mount_set_group', + 'flavor': 'ns uns', + 'flags': 'suid', + 'ropts': '--external mnt[MNT_A]:/tmp/zdtm_ext_multiple.tmp/src/a --external mnt[MNT_B]:/tmp/zdtm_ext_multiple.tmp/src/b --no-mntns-compat-mode'} From edb3b8f3d3f8cf139f53256ff15d3533061fd043 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 12 May 2022 21:44:00 +0100 Subject: [PATCH 027/122] amdgpu: Add gitignore Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 plugins/amdgpu/.gitignore diff --git a/plugins/amdgpu/.gitignore b/plugins/amdgpu/.gitignore new file mode 100644 index 0000000000..4e5c8f58e1 --- /dev/null +++ b/plugins/amdgpu/.gitignore @@ -0,0 +1,3 @@ +*.pb-c.c +*.pb-c.h +test_topology_remap From fa6efbfe852e9e85708bc2b785afd9be42af95a3 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 12 May 2022 22:40:54 +0700 Subject: [PATCH 028/122] hugetlb: don't dump anonymous private hugetlb mapping using memfd approach Currently, the content of anonymous private hugetlb mapping is dumped in 2 different images: memfd approach 
and normal private mapping dumping. In memfd approach, we dump the content of the backing pseudo file (/anon_hugepage). This is incorrect and redundant since the mapping is private, the content of backing file may differ from the content of the mapping. With this commit, we remove the redundant memfd approach dump and only do the normal private mapping dump on anonymous hugetlb mapping. Run zdtm.py run -f h --keep-img always -t zdtm/static/maps09, du -h in the dumped image directory Before this commit 13M test/dump/zdtm/static/maps09/55/1 After this commit 8.5M test/dump/zdtm/static/maps09/55/1 The reduction in size is approximately 4MB which is the size of anonymous private hugetlb mapping in the test. Signed-off-by: Bui Quang Minh --- criu/hugetlb.c | 13 +++++++++++++ criu/include/hugetlb.h | 6 ++++++ criu/proc_parse.c | 7 +++---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/criu/hugetlb.c b/criu/hugetlb.c index aa98662d81..866c4050fd 100644 --- a/criu/hugetlb.c +++ b/criu/hugetlb.c @@ -35,6 +35,19 @@ int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag) return 0; } +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma) +{ + /* + * Dump the hugetlb backed mapping using memfd_hugetlb when it is not + * anonymous private mapping. 
+ */ + if (kdat.has_memfd_hugetlb && is_hugetlb_dev(dev, hugetlb_size_flag) && + !((vma->e->flags & MAP_PRIVATE) && !strncmp(file_path, ANON_HUGEPAGE_PREFIX, ANON_HUGEPAGE_PREFIX_LEN))) + return 1; + + return 0; +} + unsigned long get_size_from_hugetlb_flag(int flag) { int i; diff --git a/criu/include/hugetlb.h b/criu/include/hugetlb.h index c0e83652b7..9aee5bed35 100644 --- a/criu/include/hugetlb.h +++ b/criu/include/hugetlb.h @@ -4,6 +4,11 @@ #include #include +#include "vma.h" + +#define ANON_HUGEPAGE_PREFIX "/anon_hugepage" +#define ANON_HUGEPAGE_PREFIX_LEN (sizeof(ANON_HUGEPAGE_PREFIX) - 1) + enum hugepage_size { HUGETLB_16KB, HUGETLB_64KB, @@ -46,6 +51,7 @@ struct htlb_info { extern struct htlb_info hugetlb_info[HUGETLB_MAX]; int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag); +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma); unsigned long get_size_from_hugetlb_flag(int flag); #ifndef MFD_HUGETLB diff --git a/criu/proc_parse.c b/criu/proc_parse.c index b3badb6e41..6b41a81db0 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -620,17 +620,16 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat pr_info("path: %s\n", file_path); vma_area->e->status |= VMA_AREA_SYSVIPC; } else { - /* Dump shmem dev, hugetlb dev (private and share) mappings the same way as memfd - * when possible. + /* We dump memfd backed mapping, both normal and hugepage anonymous share + * mapping using memfd approach when possible. 
*/ if (is_memfd(st_buf->st_dev) || is_anon_shmem_map(st_buf->st_dev) || - (kdat.has_memfd_hugetlb && is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag))) { + can_dump_with_memfd_hugetlb(st_buf->st_dev, &hugetlb_flag, file_path, vma_area)) { vma_area->e->status |= VMA_AREA_MEMFD; vma_area->e->flags |= hugetlb_flag; if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; } else if (is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag)) { - /* hugetlb mapping but memfd does not support HUGETLB */ vma_area->e->flags |= hugetlb_flag; vma_area->e->flags |= MAP_ANONYMOUS; From dc160c0d89587ae5ca793659f08619702fcad49d Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 17 Jun 2022 14:38:28 +0300 Subject: [PATCH 029/122] util/mount-v2: fix resolve_mountpoint() to always return freeable pointer Else we have a Segmentation fault in __move_mount_set_group() on xfree(source_mp) if resolve_mountpoint() returned statically allocated path. Signed-off-by: Pavel Tikhomirov --- criu/util.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/criu/util.c b/criu/util.c index 40b12bace8..5f69465b44 100644 --- a/criu/util.c +++ b/criu/util.c @@ -2021,6 +2021,10 @@ char *resolve_mountpoint(char *path) char *mp_path, *free_path; bool is_mountpoint; + /* + * The dirname() function may modify the contents of given path, + * so we need a strdup here to preserve path. + */ mp_path = free_path = xstrdup(path); if (!mp_path) return NULL; @@ -2031,7 +2035,7 @@ char *resolve_mountpoint(char *path) * by openat2 RESOLVE_NO_XDEV, let's just assume they are. 
*/ if (is_same_path(mp_path, "/")) - return mp_path; + goto out; if (path_is_mountpoint(mp_path, &is_mountpoint) == -1) { xfree(free_path); @@ -2039,7 +2043,7 @@ char *resolve_mountpoint(char *path) } if (is_mountpoint) - return mp_path; + goto out; /* Try parent directory */ mp_path = dirname(mp_path); @@ -2048,4 +2052,14 @@ char *resolve_mountpoint(char *path) /* never get here */ xfree(free_path); return NULL; +out: + /* + * The dirname() function may or may not return statically allocated + * strings, so here mp_path can be either dynamically allocated or + * statically allocated. Let's strdup to make the return pointer + * always freeable. + */ + mp_path = xstrdup(mp_path); + xfree(free_path); + return mp_path; } From f82b71c9cf940849a9647c995282efed179a1bd9 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 20 Jun 2022 12:54:29 +0300 Subject: [PATCH 030/122] zdtm: add mnt_root_ext test This test has one external mount [criumntns] /zdtm_root_ext.tmp -> [testmntns] /mnt_root_ext.test, and it specifically gives '--external mnt[MNT]:.zdtm_root_ext.tmp' option on restore without '/' to make dirname on it return static '.' path (see glibc dirname() code) and reproduce a segfault in resolve_mountpoint(). 
Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_root_ext.c | 79 ++++++++++++++++++++++++++++++ test/zdtm/static/mnt_root_ext.desc | 5 ++ test/zdtm/static/mnt_root_ext.hook | 5 ++ 4 files changed, 90 insertions(+) create mode 100644 test/zdtm/static/mnt_root_ext.c create mode 100644 test/zdtm/static/mnt_root_ext.desc create mode 100755 test/zdtm/static/mnt_root_ext.hook diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 5b88dfa1a6..a3c1ccf4bf 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -403,6 +403,7 @@ TST_DIR = \ mnt_ext_master \ mnt_ext_dev \ mnt_ext_root \ + mnt_root_ext \ mnt_ext_collision \ mntns_pivot_root \ mntns_pivot_root_ro \ diff --git a/test/zdtm/static/mnt_root_ext.c b/test/zdtm/static/mnt_root_ext.c new file mode 100644 index 0000000000..6a2eb068c6 --- /dev/null +++ b/test/zdtm/static/mnt_root_ext.c @@ -0,0 +1,79 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check external mount from host's rootfs"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname = "mnt_root_ext.test"; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char *root, testdir[PATH_MAX], nstestdir[PATH_MAX]; + char *zdtm_newns = getenv("ZDTM_NEWNS"); + char tmp[] = "/.zdtm_root_ext.tmp"; + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare directories in test root */ + sprintf(testdir, "%s/%s", root, dirname); + mkdir(testdir, 0755); + + /* Prepare directories in criu root */ + mkdir(tmp, 0755); + + /* Make criu's mntns root mount shared */ + if (mount(NULL, "/", NULL, MS_SHARED, NULL)) { + pr_perror("make shared"); + return 1; + } + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + */ + 
if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + /* + * Populate to the tests root host's rootfs subdir + */ + if (mount(tmp, testdir, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } +test: + test_init(argc, argv); + + /* + * Make "external" mount to be slave + */ + sprintf(nstestdir, "/%s", dirname); + if (mount(NULL, nstestdir, NULL, MS_SLAVE, NULL)) { + pr_perror("make slave"); + return 1; + } + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/test/zdtm/static/mnt_root_ext.desc b/test/zdtm/static/mnt_root_ext.desc new file mode 100644 index 0000000000..fee7efbae2 --- /dev/null +++ b/test/zdtm/static/mnt_root_ext.desc @@ -0,0 +1,5 @@ +{ 'dopts': '--external mnt[/mnt_root_ext.test]:MNT', + 'feature': 'mnt_id move_mount_set_group', + 'flavor': 'ns uns', + 'flags': 'suid', + 'ropts': '--external mnt[MNT]:.zdtm_root_ext.tmp --no-mntns-compat-mode'} diff --git a/test/zdtm/static/mnt_root_ext.hook b/test/zdtm/static/mnt_root_ext.hook new file mode 100755 index 0000000000..a5286f208c --- /dev/null +++ b/test/zdtm/static/mnt_root_ext.hook @@ -0,0 +1,5 @@ +#!/bin/bash + +[ "$1" == "--clean" ] || exit 0 + +rmdir /.zdtm_root_ext.tmp From 28581f2b23d0ad9f7c403fcded83adf7193e63f5 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 21 Jun 2022 12:17:52 +0300 Subject: [PATCH 031/122] config: fail on --track-mem option if dirty tracking is not available Else we trigger BUG in task_reset_dirty_track(): Error (criu/mem.c:45): BUG at criu/mem.c:45 The check in kerndat_get_dirty_track() does not work right. 
https://github.com/checkpoint-restore/criu/issues/1917 Reported-by: @mrc1119 Signed-off-by: Pavel Tikhomirov --- criu/config.c | 5 +++++ criu/kerndat.c | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/criu/config.c b/criu/config.c index 14a11f9c3e..4023d807ca 100644 --- a/criu/config.c +++ b/criu/config.c @@ -1115,6 +1115,11 @@ int check_options(void) } } + if (opts.track_mem && !kdat.has_dirty_track) { + pr_err("Tracking memory is not available. Consider omitting --track-mem option.\n"); + return 1; + } + if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; diff --git a/criu/kerndat.c b/criu/kerndat.c index b8b6bc95d7..bc5dccab18 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -420,10 +420,6 @@ static int kerndat_get_dirty_track(void) } else { no_dt: pr_info("Dirty tracking support is OFF\n"); - if (opts.track_mem) { - pr_err("Tracking memory is not available\n"); - return -1; - } } return 0; From 029ca223767f41ab8aeee6441c04aba3f3e5765b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Jun 2022 09:08:21 +0100 Subject: [PATCH 032/122] ci: Fix code indent This patch contains auto-generated changes from `make indent` Signed-off-by: Radostin Stoyanov --- criu/arch/x86/cpu.c | 2 +- criu/arch/x86/crtools.c | 2 +- criu/cr-check.c | 2 +- criu/unittest/mock.c | 6 ++---- plugins/amdgpu/amdgpu_plugin.c | 4 ++-- test/zdtm/static/inotify_system.c | 2 +- 6 files changed, 8 insertions(+), 10 deletions(-) diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index b3a7ca6365..dfa31569fa 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -78,7 +78,7 @@ int cpu_dump_cpuinfo(void) cpu_info.n_x86_entry = 1; cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ? 
CPUINFO_X86_ENTRY__VENDOR__INTEL : - CPUINFO_X86_ENTRY__VENDOR__AMD; + CPUINFO_X86_ENTRY__VENDOR__AMD; cpu_x86_info.cpu_family = rt_cpu_info.x86_family; cpu_x86_info.model = rt_cpu_info.x86_model; diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index f177b9e7b8..d10e51e480 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -409,7 +409,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { fpu_state_t *fpu_state = core_is_compat(core) ? &sigframe->compat.fpu_state : &sigframe->native.fpu_state; struct xsave_struct *x = core_is_compat(core) ? (void *)&fpu_state->fpu_state_ia32.xsave : - (void *)&fpu_state->fpu_state_64.xsave; + (void *)&fpu_state->fpu_state_64.xsave; /* * If no FPU information provided -- we're restoring diff --git a/criu/cr-check.c b/criu/cr-check.c index a172806f5c..f589a91da1 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1291,7 +1291,7 @@ static int check_net_diag_raw(void) { check_sock_diag(); return (socket_test_collect_bit(AF_INET, IPPROTO_RAW) && socket_test_collect_bit(AF_INET6, IPPROTO_RAW)) ? 0 : - -1; + -1; } static int check_pidfd_store(void) diff --git a/criu/unittest/mock.c b/criu/unittest/mock.c index 0151873dc1..e517720e42 100644 --- a/criu/unittest/mock.c +++ b/criu/unittest/mock.c @@ -103,8 +103,7 @@ void set_cr_errno(int new_err) { } -struct ns_desc { -}; +struct ns_desc {}; struct ns_desc user_ns_desc; int switch_ns(int pid, struct ns_desc *nd, int *rst) { @@ -118,8 +117,7 @@ int run_scripts(enum script_actions act) } typedef struct VmaEntry VmaEntry; -struct VmaEntry { -}; +struct VmaEntry {}; void vma_entry__init(VmaEntry *message) { } diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index e48c8988b2..0a55e34a2b 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -873,7 +873,7 @@ void *dump_bo_contents(void *_thread_data) } max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; + SDMA_LINEAR_COPY_MAX_SIZE - 1; for (i = 0; i < thread_data->num_of_bos; i++) { if (bo_buckets[i].gpu_id == thread_data->gpu_id && @@ -967,7 +967,7 @@ void *restore_bo_contents(void *_thread_data) } max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; + SDMA_LINEAR_COPY_MAX_SIZE - 1; snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, false, &image_size); diff --git a/test/zdtm/static/inotify_system.c b/test/zdtm/static/inotify_system.c index 141476415a..079d4b1613 100644 --- a/test/zdtm/static/inotify_system.c +++ b/test/zdtm/static/inotify_system.c @@ -57,7 +57,7 @@ const char *inot_dir = "./inotify.no_del"; (MASK == IN_UNMOUNT) ? "IN_UNMOUNT" : \ (MASK == IN_Q_OVERFLOW) ? "IN_Q_OVERFLOW" : \ (MASK == IN_IGNORED) ? "IN_IGNORED" : \ - "UNKNOWN" + "UNKNOWN" #include #include From 7968e71db86f92b09a207bb580924265892fe8ec Mon Sep 17 00:00:00 2001 From: Yuriy Vasiliev Date: Thu, 20 Jan 2022 17:13:59 +0100 Subject: [PATCH 033/122] infect: add SIGTSTP support Add SIGTSTP signal dump and restore. Add a corresponding field in the image, save it only if a task is in the stopped state. Restore task state by sending desired stop signal if it is present in the image. Fallback to SIGSTOP if it's absent. 
Signed-off-by: Yuriy Vasiliev --- Documentation/compel.txt | 5 ++- compel/include/uapi/infect.h | 3 ++ compel/src/lib/infect.c | 87 +++++++++++++++++++++++++++++------- criu/cr-dump.c | 5 +++ criu/cr-restore.c | 11 ++++- criu/include/pid.h | 4 ++ criu/proc_parse.c | 15 ++++++- criu/pstree.c | 1 + criu/seize.c | 8 +++- images/core.proto | 2 + 10 files changed, 121 insertions(+), 20 deletions(-) diff --git a/Documentation/compel.txt b/Documentation/compel.txt index a44ca22c66..506228f592 100644 --- a/Documentation/compel.txt +++ b/Documentation/compel.txt @@ -97,7 +97,10 @@ Following steps are performed to infect the victim process: - execute system call: *int compel_syscall(ctl, int syscall_nr, long *ret, int arg ...);* - infect victim: *int compel_infect(ctl, nr_thread, size_of_args_area);* - cure the victim: *int compel_cure(ctl);* //ctl pointer is freed by this call - - Resume victim: *int compel_resume_task(pid, orig_state, state);* + - Resume victim: *int compel_resume_task(pid, orig_state, state)* or + *int compel_resume_task_sig(pid, orig_state, state, stop_signo).* + //compel_resume_task_sig() could be used in case when victim is in stopped state. + stop_signo could be read by calling compel_parse_stop_signo(). *ctl* must be configured with blob information by calling *PREFIX_setup_c_header()*, with ctl as its argument. *PREFIX* is the argument given to *-p* when calling hgen, else it is deduced from file name. 
diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 3040a67a78..7073f343f2 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -18,6 +18,7 @@ extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { unsigned long long sigpnd; unsigned long long shdpnd; + unsigned long long sigblk; char state; int vpid; int ppid; @@ -30,7 +31,9 @@ extern int __must_check compel_wait_task(int pid, int ppid, struct seize_task_status *st, void *data); extern int __must_check compel_stop_task(int pid); +extern int __must_check compel_parse_stop_signo(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); +extern int compel_resume_task_sig(pid_t pid, int orig_state, int state, int stop_signo); struct parasite_ctl; struct parasite_thread_ctl; diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index c78c02a6a0..b99f23b360 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -92,6 +92,12 @@ static int parse_pid_status(int pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(aux, "SigBlk:", 7)) { + if (sscanf(aux + 7, "%llx", &ss->sigblk) != 1) + goto err_parse; + + continue; + } } fclose(f); @@ -186,6 +192,29 @@ static int skip_sigstop(int pid, int nr_signals) return 0; } +#define SIG_MASK(sig) (1ULL << ((sig)-1)) + +#define SIG_IN_MASK(sig, mask) ((sig) > 0 && (sig) <= SIGMAX && (SIG_MASK(sig) & (mask))) + +#define SUPPORTED_STOP_MASK ((1ULL << (SIGSTOP - 1)) | (1ULL << (SIGTSTP - 1))) + +static inline int sig_stop(int sig) +{ + return SIG_IN_MASK(sig, SUPPORTED_STOP_MASK); +} + +int compel_parse_stop_signo(int pid) +{ + siginfo_t si; + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si) < 0) { + pr_perror("SEIZE %d: can't parse stopped siginfo", pid); + return -1; + } + + return si.si_signo; +} + /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace @@ -198,7 +227,7 @@ int 
compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ void *data) { siginfo_t si; - int status, nr_sigstop; + int status, nr_stopsig; int ret = 0, ret2, wait_errno = 0; /* @@ -291,17 +320,32 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ goto err; } - nr_sigstop = 0; - if (ss->sigpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (ss->shdpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (si.si_signo == SIGSTOP) - nr_sigstop++; + nr_stopsig = 0; + if (SIG_IN_MASK(SIGSTOP, ss->sigpnd)) + nr_stopsig++; + if (SIG_IN_MASK(SIGSTOP, ss->shdpnd)) + nr_stopsig++; + + if (SIG_IN_MASK(SIGTSTP, ss->sigpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + if (SIG_IN_MASK(SIGTSTP, ss->shdpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + + if (sig_stop(si.si_signo)) + nr_stopsig++; - if (nr_sigstop) { - if (skip_sigstop(pid, nr_sigstop)) - goto err_stop; + if (nr_stopsig) { + if (skip_sigstop(pid, nr_stopsig)) { + /* + * Make sure that the task is stopped by a supported stop signal and + * send it again to restore task state before criu intervention. + */ + if (sig_stop(si.si_signo)) + kill(pid, si.si_signo); + else + kill(pid, SIGSTOP); + goto err; + } return COMPEL_TASK_STOPPED; } @@ -313,8 +357,6 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ goto err; } -err_stop: - kill(pid, SIGSTOP); err: if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); @@ -322,6 +364,11 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ } int compel_resume_task(pid_t pid, int orig_st, int st) +{ + return compel_resume_task_sig(pid, orig_st, st, SIGSTOP); +} + +int compel_resume_task_sig(pid_t pid, int orig_st, int st, int stop_signo) { int ret = 0; @@ -345,8 +392,18 @@ int compel_resume_task(pid_t pid, int orig_st, int st) * task with STOP in queue that would get lost after * detach, so stop it again. 
*/ - if (orig_st == COMPEL_TASK_STOPPED) - kill(pid, SIGSTOP); + if (orig_st == COMPEL_TASK_STOPPED) { + /* + * Check that stop_signo contain supported stop signal. + * If it isn't, then send SIGSTOP. It makes sense in the case + * when we get COMPEL_TASK_STOPPED from old image, + * where stop_signo was not yet supported. + */ + if (sig_stop(stop_signo)) + kill(pid, stop_signo); + else + kill(pid, SIGSTOP); + } } else { pr_err("Unknown final state %d\n", st); ret = -1; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 60e90baed2..e60da88ed7 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -781,6 +781,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[0]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; + if (core->tc->task_state == TASK_STOPPED) { + core->tc->has_stop_signo = true; + core->tc->stop_signo = item->pid->stop_signo; + } + ret = parasite_dump_thread_leader_seized(ctl, pid, core); if (ret) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 398faf048d..279246c190 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1350,6 +1350,9 @@ static inline int fork_with_pid(struct pstree_item *item) item->pid->state = ca.core->tc->task_state; rsti(item)->cg_set = ca.core->tc->cg_set; + if (ca.core->tc->has_stop_signo) + item->pid->stop_signo = ca.core->tc->stop_signo; + if (item->pid->state != TASK_DEAD && !task_alive(item)) { pr_err("Unknown task state %d\n", item->pid->state); return -1; @@ -2104,8 +2107,14 @@ static void finalize_restore(void) xfree(ctl); - if ((item->pid->state == TASK_STOPPED) || (opts.final_state == TASK_STOPPED)) + if (opts.final_state == TASK_STOPPED) kill(item->pid->real, SIGSTOP); + else if (item->pid->state == TASK_STOPPED) { + if (item->pid->stop_signo > 0) + kill(item->pid->real, item->pid->stop_signo); + else + kill(item->pid->real, SIGSTOP); + } } } diff --git 
a/criu/include/pid.h b/criu/include/pid.h index 49cb2d322e..b2b7a361a6 100644 --- a/criu/include/pid.h +++ b/criu/include/pid.h @@ -31,6 +31,10 @@ struct pid { pid_t real; int state; /* TASK_XXX constants */ + /* If an item is in stopped state it has a signal number + * that caused task to stop. + */ + int stop_signo; /* * The @virt pid is one which used in the image itself and keeps diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 6b41a81db0..946b0fc40e 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1027,12 +1027,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) cr->s.sigpnd = 0; cr->s.shdpnd = 0; + cr->s.sigblk = 0; cr->s.seccomp_mode = SECCOMP_MODE_DISABLED; if (bfdopenr(&f)) return -1; - while (done < 13) { + while (done < 14) { str = breadline(&f); if (str == NULL) break; @@ -1143,13 +1144,23 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) goto err_parse; cr->s.sigpnd |= sigpnd; + done++; + continue; + } + if (!strncmp(str, "SigBlk:", 7)) { + unsigned long long sigblk = 0; + + if (sscanf(str + 7, "%llx", &sigblk) != 1) + goto err_parse; + cr->s.sigblk |= sigblk; + done++; continue; } } /* seccomp and nspids are optional */ - expected_done = (parsed_seccomp ? 11 : 10); + expected_done = (parsed_seccomp ? 
12 : 11); if (kdat.has_nspid) expected_done++; if (done == expected_done) diff --git a/criu/pstree.c b/criu/pstree.c index f4d77b3a49..72c4a3502a 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -222,6 +222,7 @@ struct pstree_item *__alloc_pstree_item(bool rst) item->pid->ns[0].virt = -1; item->pid->real = -1; item->pid->state = TASK_UNDEF; + item->pid->stop_signo = -1; item->born_sid = -1; item->pid->item = item; futex_init(&item->task_st); diff --git a/criu/seize.c b/criu/seize.c index 58564ca746..1333d6db97 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -615,6 +615,9 @@ static int collect_children(struct pstree_item *item) else processes_to_wait--; + if (ret == TASK_STOPPED) + c->pid->stop_signo = compel_parse_stop_signo(pid); + c->pid->real = pid; c->parent = item; c->pid->state = ret; @@ -646,7 +649,7 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) * the item->state is the state task was in when we seized one. */ - compel_resume_task(item->pid->real, item->pid->state, st); + compel_resume_task_sig(item->pid->real, item->pid->state, st, item->pid->stop_signo); if (st == TASK_DEAD) return; @@ -950,6 +953,9 @@ int collect_pstree(void) else processes_to_wait--; + if (ret == TASK_STOPPED) + root_item->pid->stop_signo = compel_parse_stop_signo(pid); + pr_info("Seized task %d, state %d\n", pid, ret); root_item->pid->state = ret; diff --git a/images/core.proto b/images/core.proto index 35079f366f..345bdca53b 100644 --- a/images/core.proto +++ b/images/core.proto @@ -60,6 +60,8 @@ message task_core_entry { // Reserved for container relative start time //optional uint64 start_time = 19; optional uint64 blk_sigset_extended = 20[(criu).hex = true]; + + optional uint32 stop_signo = 21; } message task_kobj_ids_entry { From c8f9880adab038481f7806173b698fc6e17ba76a Mon Sep 17 00:00:00 2001 From: Yuriy Vasiliev Date: Tue, 18 Jan 2022 14:35:55 +0100 Subject: [PATCH 034/122] zdtm: add tests for SIGTSTP stopped03 check that stopped by SIGTSTP 
tasks are restored correctly. stopped04 check that stopped by SIGSTOP tasks which have blocked SIGTSTP and have SIGTSTP pending are restored correctly. Signed-off-by: Yuriy Vasiliev --- test/zdtm/static/Makefile | 2 + test/zdtm/static/stopped03.c | 161 +++++++++++++++++++++++++++++++++++ test/zdtm/static/stopped04.c | 135 +++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 test/zdtm/static/stopped03.c create mode 100644 test/zdtm/static/stopped04.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index a3c1ccf4bf..5a8a5f75cb 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -184,6 +184,8 @@ TST_NOFILE := \ stopped01 \ stopped02 \ stopped12 \ + stopped03 \ + stopped04 \ rtc \ clean_mntns \ mntns_rw_ro_rw \ diff --git a/test/zdtm/static/stopped03.c b/test/zdtm/static/stopped03.c new file mode 100644 index 0000000000..85c7177f78 --- /dev/null +++ b/test/zdtm/static/stopped03.c @@ -0,0 +1,161 @@ +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly"; +const char *test_author = "Yuriy Vasiliev "; + +#define STOP_SIGNO SIGTSTP +const char *stop_sigstr = "SIGTSTP"; +enum { + FUTEX_INITIALIZED = 0, + TEST_CRIU, + TEST_CHECK, + TEST_DONE, + TEST_EXIT, + TEST_EMERGENCY_ABORT, +}; + +struct shared { + futex_t fstate; + int status; + int code; +} * sh; + +static int new_pgrp(void) +{ + siginfo_t infop; + int ret = 1; + pid_t pid; + + /* + * Set the PGID to avoid creating an orphaned process group, + * which is not to be affected by terminal-generated stop signals. 
+ */ + setpgid(0, 0); + + pid = test_fork(); + if (pid < 0) + goto err_cr; + + if (pid == 0) { + /* wait for TEST_EXIT or TEST_EMERGENCY_ABORT*/ + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + exit(0); + } + + if (kill(pid, STOP_SIGNO)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + sh->code = infop.si_code; + sh->status = infop.si_status; + + /* Return the control back to MAIN worker to do C/R */ + futex_set_and_wake(&sh->fstate, TEST_CRIU); + futex_wait_while_lt(&sh->fstate, TEST_CHECK); + + infop.si_code = 0; + infop.si_status = 0; + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + sh->code = infop.si_code; + sh->status = infop.si_status; + + futex_set_and_wake(&sh->fstate, TEST_DONE); + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + + ret = 0; +err_cont: + kill(pid, SIGCONT); +err_cr: + if (ret) + futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT); + if (pid > 0) + wait(NULL); + + return ret; +} + +int main(int argc, char **argv) +{ + int fail = 0; + pid_t pid; + + test_init(argc, argv); + + sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (sh == MAP_FAILED) { + pr_perror("Failed to alloc shared region"); + return 1; + } + + futex_set(&sh->fstate, FUTEX_INITIALIZED); + + pid = test_fork(); + if (pid < 0) { + fail = 1; + goto out; + } + + if (pid == 0) + exit(new_pgrp()); + + /* Wait until pgrp is ready to C/R */ + futex_wait_while_lt(&sh->fstate, TEST_CRIU); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker before C/R\n"); + fail = 1; + goto out; + } + + if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) { + pr_err("Process is not in correct state before C/R." + " Expected stop signo: %d. 
Get stop signo: %d\n", + STOP_SIGNO, sh->status); + fail = 1; + goto out; + } + + test_daemon(); + test_waitsig(); + + futex_set_and_wake(&sh->fstate, TEST_CHECK); + futex_wait_while_lt(&sh->fstate, TEST_DONE); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker after C/R\n"); + goto out; + } + + if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) { + fail = 1; + pr_err("Process is not in correct state after C/R." + " Expected stop signo: %d. Get stop signo: %d\n", + STOP_SIGNO, sh->status); + } + + if (!fail) + pass(); + + futex_set_and_wake(&sh->fstate, TEST_EXIT); +out: + if (pid > 0) + wait(NULL); + + munmap(sh, sizeof(struct shared)); + + return fail; +} diff --git a/test/zdtm/static/stopped04.c b/test/zdtm/static/stopped04.c new file mode 100644 index 0000000000..237094ca43 --- /dev/null +++ b/test/zdtm/static/stopped04.c @@ -0,0 +1,135 @@ +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly"; +const char *test_author = "Yuriy Vasiliev "; + +const char *stop_sigstr = "SIGTSTP"; +enum { + FUTEX_INITIALIZED = 0, + TEST_CRIU, + TEST_DONE, + TEST_EXIT, + TEST_EMERGENCY_ABORT, +}; + +struct shared { + futex_t fstate; + int status; + int code; +} * sh; + +static int new_pgrp(void) +{ + sigset_t sigset; + siginfo_t infop; + int ret = 1; + pid_t pid; + + /* + * Set the PGID to avoid creating an orphaned process group, + * which is not to be affected by terminal-generated stop signals. 
+ */ + setpgid(0, 0); + + sigemptyset(&sigset); + sigaddset(&sigset, SIGTSTP); + sigprocmask(SIG_BLOCK, &sigset, NULL); + + pid = test_fork(); + if (pid < 0) + goto err_cr; + + if (pid == 0) { + /* wait for TEST_EXIT or TEST_EMERGENCY_ABORT*/ + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + exit(0); + } + + if (kill(pid, SIGSTOP)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + if (kill(pid, SIGTSTP)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + /* Return the control back to MAIN worker to do C/R */ + futex_set_and_wake(&sh->fstate, TEST_CRIU); + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + + ret = 0; +err_cont: + kill(pid, SIGCONT); +err_cr: + if (ret) + futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT); + if (pid > 0) + wait(NULL); + + return ret; +} + +int main(int argc, char **argv) +{ + int fail = 0; + pid_t pid; + + test_init(argc, argv); + + sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (sh == MAP_FAILED) { + pr_perror("Failed to alloc shared region"); + return 1; + } + + futex_set(&sh->fstate, FUTEX_INITIALIZED); + + pid = test_fork(); + if (pid < 0) { + fail = 1; + goto out; + } + + if (pid == 0) + exit(new_pgrp()); + + /* Wait until pgrp is ready to C/R */ + futex_wait_while_lt(&sh->fstate, TEST_CRIU); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker before C/R\n"); + fail = 1; + goto out; + } + + test_daemon(); + test_waitsig(); + + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker after C/R\n"); + goto out; + } + + if (!fail) + pass(); + + futex_set_and_wake(&sh->fstate, TEST_EXIT); +out: + if (pid > 0) + wait(NULL); + + munmap(sh, sizeof(struct shared)); + + return fail; +} From 290a998ec827dac96f22bdbc978dc931784b6d1e Mon Sep 17 00:00:00 2001 
From: Younes Manton Date: Mon, 30 May 2022 17:34:20 +0000 Subject: [PATCH 035/122] config/files-reg: Add opt to skip file r/w/x check on restore A file's r/w/x changing between checkpoint and restore does not necessarily imply that something is wrong. For example, if a process opens a file having perms rw- for reading and we change the perms to r--, the process can be restored and will function as expected. Therefore, this patch adds an option --skip-file-rwx-check to disable this check on restore. File validation is unaffected and should still function as expected with respect to the content of files. Signed-off-by: Younes Manton --- Documentation/criu.txt | 3 +++ criu/config.c | 1 + criu/cr-service.c | 3 +++ criu/crtools.c | 3 +++ criu/files-reg.c | 18 +++++++++++++++--- criu/include/cr_options.h | 1 + images/rpc.proto | 1 + lib/c/criu.c | 11 +++++++++++ lib/c/criu.h | 2 ++ 9 files changed, 40 insertions(+), 3 deletions(-) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 8b128f63ee..8d2e91443d 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -668,6 +668,9 @@ The 'mode' may be one of the following: build-ID cannot be obtained, 'chksm-first' method will be used. This is the default if mode is unspecified. +*--skip-file-rwx-check*:: + Skip checking file permissions (r/w/x for u/g/o) on restore. 
+ *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to diff --git a/criu/config.c b/criu/config.c index 4023d807ca..24c445c8bd 100644 --- a/criu/config.c +++ b/criu/config.c @@ -696,6 +696,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "cgroup-yard", required_argument, 0, 1096 }, { "pre-dump-mode", required_argument, 0, 1097 }, { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check), { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), diff --git a/criu/cr-service.c b/criu/cr-service.c index a6eb9ebd30..1d9f0aca3b 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -464,6 +464,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_shell_job) opts.shell_job = req->shell_job; + if (req->has_skip_file_rwx_check) + opts.skip_file_rwx_check = req->skip_file_rwx_check; + if (req->has_file_locks) opts.handle_file_locks = req->file_locks; diff --git a/criu/crtools.c b/criu/crtools.c index cc8d9179fe..8bcbe8e38f 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -504,6 +504,9 @@ int main(int argc, char *argv[], char *envp[]) " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" + " --skip-file-rwx-check\n" + " Skip checking file permissions\n" + " (r/w/x for u/g/o) on restore.\n" "\n" "Check options:\n" " Without options, \"criu check\" checks availability of absolutely required\n" diff --git a/criu/files-reg.c b/criu/files-reg.c index 0249063c26..ce87886373 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2199,9 +2199,21 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil if (!validate_file(tmp, &st, rfi)) goto err; - if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { - pr_err("File %s has bad mode 
0%o (expect 0%o)\n", rfi->path, (int)st.st_mode, rfi->rfe->mode); - goto err; + if (rfi->rfe->has_mode) { + mode_t curr_mode = st.st_mode; + mode_t saved_mode = rfi->rfe->mode; + + if (opts.skip_file_rwx_check) { + curr_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + saved_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + } + + if (curr_mode != saved_mode) { + pr_err("File %s has bad mode 0%o (expect 0%o)\n" + "File r/w/x checks can be skipped with the --skip-file-rwx-check option\n", + rfi->path, (int)curr_mode, saved_mode); + goto err; + } } /* diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index bf1a762cc6..e544a2d9a1 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -179,6 +179,7 @@ struct cr_options { bool lazy_pages; char *work_dir; int network_lock_method; + int skip_file_rwx_check; /* * When we scheduler for removal some functionality we first diff --git a/images/rpc.proto b/images/rpc.proto index a6cc5da487..3cf431639c 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -138,6 +138,7 @@ message criu_opts { optional string lsm_mount_context = 63; optional criu_network_lock_method network_lock = 64 [default = IPTABLES]; optional bool mntns_compat_mode = 65; + optional bool skip_file_rwx_check = 66; /* optional bool check_mounts = 128; */ } diff --git a/lib/c/criu.c b/lib/c/criu.c index 7807d7bc58..8171f7a126 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -555,6 +555,17 @@ void criu_set_shell_job(bool shell_job) criu_local_set_shell_job(global_opts, shell_job); } +void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check) +{ + opts->rpc->has_skip_file_rwx_check = true; + opts->rpc->skip_file_rwx_check = skip_file_rwx_check; +} + +void criu_set_skip_file_rwx_check(bool skip_file_rwx_check) +{ + criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check); +} + void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master) { opts->rpc->has_orphan_pts_master = true; diff --git 
a/lib/c/criu.h b/lib/c/criu.h index 7cc6a199c2..c32a8a6462 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -78,6 +78,7 @@ void criu_set_tcp_close(bool tcp_close); void criu_set_weak_sysctls(bool val); void criu_set_evasive_devices(bool evasive_devices); void criu_set_shell_job(bool shell_job); +void criu_set_skip_file_rwx_check(bool skip_file_rwx_check); void criu_set_orphan_pts_master(bool orphan_pts_master); void criu_set_file_locks(bool file_locks); void criu_set_track_mem(bool track_mem); @@ -238,6 +239,7 @@ void criu_local_set_tcp_close(criu_opts *opts, bool tcp_close); void criu_local_set_weak_sysctls(criu_opts *opts, bool val); void criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices); void criu_local_set_shell_job(criu_opts *opts, bool shell_job); +void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check); void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master); void criu_local_set_file_locks(criu_opts *opts, bool file_locks); void criu_local_set_track_mem(criu_opts *opts, bool track_mem); From 8f04c131cb71295d2f496fa7889d0e6995baf77c Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 3 Jun 2022 09:47:11 -0700 Subject: [PATCH 036/122] Add --skip-file-rwx-check opt test Add a simple test using tail to check that processes can't be restored by default when the r/w/x mode of an open file changes, unless --skip-file-rwx-check is used. 
Signed-off-by: Younes Manton --- scripts/ci/run-ci-tests.sh | 1 + test/Makefile | 2 +- test/others/skip-file-rwx-check/Makefile | 7 +++++ test/others/skip-file-rwx-check/run.sh | 37 ++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 test/others/skip-file-rwx-check/Makefile create mode 100755 test/others/skip-file-rwx-check/run.sh diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 8d9de6e55f..3760a65e3d 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -260,6 +260,7 @@ if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi +make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling diff --git a/test/Makefile b/test/Makefile index 8416b19619..e8fcffe3fc 100644 --- a/test/Makefile +++ b/test/Makefile @@ -12,7 +12,7 @@ all: $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job skip-file-rwx-check other: for t in $(TESTS); do \ diff --git a/test/others/skip-file-rwx-check/Makefile b/test/others/skip-file-rwx-check/Makefile new file mode 100644 index 0000000000..419d592b73 --- /dev/null +++ b/test/others/skip-file-rwx-check/Makefile @@ -0,0 +1,7 @@ +.PHONY: run clean + +run: + ./run.sh + +clean: + rm -rf testfile *.img dump.log restore-expected-fail.log restore.log stats-dump stats-restore diff --git a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh new file mode 100755 index 0000000000..0803d78eca --- /dev/null +++ b/test/others/skip-file-rwx-check/run.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + 
+source ../env.sh + +make clean +touch testfile +chmod +w testfile +tail --follow testfile & +tailpid=$! +if ! "$criu" dump --tree=$tailpid --shell-job --verbosity=4 --log-file=dump.log +then + kill $tailpid + echo "Failed to dump process as expected" + echo FAIL + exit 1 +fi +chmod -w testfile +if "$criu" restore --restore-detached --shell-job --verbosity=4 --log-file=restore-expected-fail.log +then + kill $tailpid + echo "Unexpectedly restored process with reference to a file who's r/w/x perms changed when --skip-file-rwx-check option was not used" + echo FAIL + exit 1 +fi +if ! "$criu" restore --skip-file-rwx-check --restore-detached --shell-job --verbosity=4 --log-file=restore.log +then + echo "Failed to restore process with reference to a file who's r/w/x perms changed when --skip-file-rwx-check option was used" + echo FAIL + exit 1 +fi +kill $tailpid +echo PASS From 1e6e826ffb7ac05f33fa123051c2fc2ddf0f68ea Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 8 Jul 2022 12:36:57 +0000 Subject: [PATCH 037/122] rseq: fix headers conflict on Mariner GNU/Linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. For some reason, the Mariner distribution headers do not correctly define the __GLIBC_HAVE_KERNEL_RSEQ compile-time constant. It remains undefined, but in fact the header files do provide the corresponding rseq type declarations, which leads to a conflict. 2. Another issue is that they use uint*_t types instead of __u* types as in the original rseq.h. 
This leads to compile time issues like this: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type 'uint64_t' {aka 'long unsigned int'} and we can't even replace %llx to %PRIx64 because it will break compilation on other distros (like Fedora) with analogical error: error: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 6 has type ‘__u64’ {aka ‘long long unsigned int’} Let's use our-own struct rseq copy fully equal to the kernel one, it's safe because this structure is a part of Linux Kernel ABI. Fixes #1934 Reported-by: Nikola Bojanic Signed-off-by: Alexander Mikhalitsyn --- Makefile.config | 3 ++- criu/cr-dump.c | 15 ++++++++------- criu/include/linux/rseq.h | 20 ++++++++++++++------ criu/include/pstree.h | 2 +- scripts/feature-tests.mak | 19 +++++++++++++++++++ 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/Makefile.config b/Makefile.config index d46d84f2de..d113e2246b 100644 --- a/Makefile.config +++ b/Makefile.config @@ -78,7 +78,8 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE OPENAT2 + SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE \ + OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name define gen-feature-test diff --git a/criu/cr-dump.c b/criu/cr-dump.c index e60da88ed7..210f662323 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1039,7 +1039,7 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) return 0; } -static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct rseq_cs *rseq_cs, +static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct criu_rseq_cs *rseq_cs, struct criu_rseq *rseq) { int ret; @@ -1070,10 +1070,11 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, st if (!rseq->rseq_cs) return 0; - ret = 
ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct rseq_cs)); + ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct criu_rseq_cs)); if (ret) { pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, - (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, (unsigned long)sizeof(struct rseq_cs)); + (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, + (unsigned long)sizeof(struct criu_rseq_cs)); return -1; } @@ -1088,7 +1089,7 @@ static int dump_thread_rseq(struct pstree_item *item, int i) CoreEntry *core = item->core[i]; RseqEntry **rseqep = &core->thread_core->rseq_entry; struct criu_rseq rseq = {}; - struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; /* @@ -1154,7 +1155,7 @@ static int dump_thread_rseq(struct pstree_item *item, int i) static int dump_task_rseq(pid_t pid, struct pstree_item *item) { int i; - struct rseq_cs *thread_rseq_cs; + struct criu_rseq_cs *thread_rseq_cs; /* if rseq() syscall isn't supported then nothing to dump */ if (!kdat.has_rseq) @@ -1179,7 +1180,7 @@ static int dump_task_rseq(pid_t pid, struct pstree_item *item) return -1; } -static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) +static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) { return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; } @@ -1187,7 +1188,7 @@ static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) static int fixup_thread_rseq(struct pstree_item *item, int i) { CoreEntry *core = item->core[i]; - struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; /* equivalent to (struct rseq)->rseq_cs is NULL */ diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h index a47876e669..5ceefbf8e1 100644 --- 
a/criu/include/linux/rseq.h +++ b/criu/include/linux/rseq.h @@ -9,7 +9,12 @@ #endif #endif -#ifndef __GLIBC_HAVE_KERNEL_RSEQ +#include +#include + +#include "common/config.h" + +#ifdef CONFIG_HAS_NO_LIBC_RSEQ_DEFS /* * linux/rseq.h * @@ -18,9 +23,6 @@ * Copyright (c) 2015-2018 Mathieu Desnoyers */ -#include -#include - enum rseq_cpu_id_state { RSEQ_CPU_ID_UNINITIALIZED = -1, RSEQ_CPU_ID_REGISTRATION_FAILED = -2, @@ -41,13 +43,20 @@ enum rseq_cs_flags { RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), }; +#endif /* CONFIG_HAS_NO_LIBC_RSEQ_DEFS */ +/* + * Let's use our own definition of struct rseq_cs because some distros + * (for example Mariner GNU/Linux) declares this structure their-own way. + * This makes trouble with inconsistency between printf formatters and + * struct rseq_cs field types. + */ /* * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. It is usually declared as * link-time constant data. */ -struct rseq_cs { +struct criu_rseq_cs { /* Version of this structure. 
*/ __u32 version; /* enum rseq_cs_flags */ @@ -57,7 +66,6 @@ struct rseq_cs { __u64 post_commit_offset; __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); -#endif /* __GLIBC_HAVE_KERNEL_RSEQ */ /* * We have to have our own copy of struct rseq definition because diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 8ae750e1af..1137046d43 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -63,7 +63,7 @@ struct dmp_info { struct parasite_ctl *parasite_ctl; struct parasite_thread_ctl **thread_ctls; uint64_t *thread_sp; - struct rseq_cs *thread_rseq_cs; + struct criu_rseq_cs *thread_rseq_cs; /* * Although we don't support dumping different struct creds in general, diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index 592552cb8e..014e893a84 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -196,3 +196,22 @@ int main(void) return 0; } endef + +define FEATURE_TEST_NO_LIBC_RSEQ_DEFS + +#ifdef __has_include +#if __has_include(\"sys/rseq.h\") +#include +#endif +#endif + +enum rseq_cpu_id_state { + RSEQ_CPU_ID_UNINITIALIZED = -1, + RSEQ_CPU_ID_REGISTRATION_FAILED = -2, +}; + +int main(void) +{ + return 0; +} +endef From 90c0f0874ed51c963b90bdb434dd48fe1d3bfb0a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 10 May 2022 13:37:09 +0200 Subject: [PATCH 038/122] x86/compel/fault-inject: fixup mxcsr for PTRACE_SETFPREGS Error from: ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst (00.003111) Dumping GP/FPU registers for 56 (00.003121) Error (compel/arch/x86/src/lib/infect.c:310): Corrupting fpuregs for 56, seed 1651766595 (00.003125) Error (compel/arch/x86/src/lib/infect.c:314): Can't set FPU registers for 56: Invalid argument (00.003129) Error (compel/src/lib/infect.c:688): Can't obtain regs for thread 56 (00.003174) Error (criu/cr-dump.c:1564): Can't infect (pid: 56) with parasite See also: 145e9e0d8c6 ("x86/fpu: Fail ptrace() requests that try to set invalid MXCSR 
values") https://github.com/torvalds/linux/commit/145e9e0d8c6fada4a40f9fc65b34658077874d9c We decided to move from mxcsr cleaning up scheme and use mxcsr mask (0x0000ffbf) as kernel does. Thanks to Dmitry Safonov for pointing out. Tested-on: Intel(R) Xeon(R) CPU E3-1246 v3 @ 3.50GHz Reported-by: Mr. Jenkins Suggested-by: Dmitry Safonov Signed-off-by: Alexander Mikhalitsyn --- compel/arch/x86/src/lib/infect.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 98e2512e7c..c0e7a544a0 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -245,6 +245,19 @@ static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) return 0; } +static inline void fixup_mxcsr(struct xsave_struct *xsave) +{ + /* + * Right now xsave->i387.mxcsr filled with the random garbage, + * let's make it valid by applying mask which allows all + * features, except the denormals-are-zero feature bit. + * + * See also fpu__init_system_mxcsr function: + * https://github.com/torvalds/linux/blob/8cb1ae19/arch/x86/kernel/fpu/init.c#L117 + */ + xsave->i387.mxcsr &= 0x0000ffbf; +} + /* See arch/x86/kernel/fpu/xstate.c */ static void validate_random_xstate(struct xsave_struct *xsave) { @@ -272,17 +285,6 @@ static void validate_random_xstate(struct xsave_struct *xsave) /* No reserved bits may be set */ memset(&hdr->reserved, 0, sizeof(hdr->reserved)); - - /* - * While using PTRACE_SETREGSET the kernel checks that - * "Reserved bits in MXCSR must be zero." - * if (mxcsr[0] & ~mxcsr_feature_mask) - * return -EINVAL; - * - * As the mxcsr_feature_mask depends on the CPU the easiest solution for - * this error injection test is to set mxcsr just to zero. - */ - xsave->i387.mxcsr = 0; } /* @@ -309,6 +311,8 @@ static int corrupt_extregs(pid_t pid) */ pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? 
"xsave" : "fpuregs", pid, init_seed); + fixup_mxcsr(&ext_regs); + if (!use_xsave) { if (ptrace(PTRACE_SETFPREGS, pid, NULL, &ext_regs)) { pr_perror("Can't set FPU registers for %d", pid); From ebe9db972419da4f9cf4ecd1bdc3bfd7097c28cc Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 21 Jul 2022 22:46:10 +0700 Subject: [PATCH 039/122] zdtm: Remove permission part check for skipping vsyscall vma Normally, vsyscall vma has VM_READ, VM_EXEC permission. However, when CONFIG_LEGACY_VSYSCALL_XONLY=y, that vma only has VM_EXEC. This commit removes the permission part when checking to skip vsyscall vma in x32 tests. Signed-off-by: Bui Quang Minh --- test/zdtm.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index c011c79c0e..d264c4878e 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1651,6 +1651,15 @@ def get_visible_state(test): return files, maps, mounts +def has_vsyscall(maps): + vsyscall = u"ffffffffff600000-ffffffffff601000" + for i in maps: + if vsyscall in i: + return i + + return None + + def check_visible_state(test, state, opts): new = get_visible_state(test) @@ -1666,9 +1675,9 @@ def check_visible_state(test, state, opts): new_maps = new[1][pid] if os.getenv("COMPAT_TEST"): # the vsyscall vma isn't unmapped from x32 processes - vsyscall = u"ffffffffff600000-ffffffffff601000 r-xp" - if vsyscall in new_maps and vsyscall not in old_maps: - new_maps.remove(vsyscall) + entry = has_vsyscall(new_maps) + if entry and has_vsyscall(old_maps) is None: + new_maps.remove(entry) if old_maps != new_maps: print("%s: Old maps lost: %s" % (pid, old_maps - new_maps)) print("%s: New maps appeared: %s" % (pid, new_maps - old_maps)) From e15690ba190ddd48074588378034a83015dd97c7 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 21 Jul 2022 21:26:58 +0700 Subject: [PATCH 040/122] vdso-compat: Increase the reserved buffer for compat vdso On Arch Linux with 5.18.3-zen1-1-zen kernel, the vdso's size is 3 pages 
which exceeds the current 2-page reserved buffer. This commit simply increases the reserved buffer size to 4 pages. Fixes: https://github.com/checkpoint-restore/criu/issues/1916 Signed-off-by: Bui Quang Minh --- criu/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/vdso.c b/criu/vdso.c index 1a51f1451d..7de2fae784 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -479,7 +479,7 @@ static int vdso_mmap_compat(struct vdso_maps *native, struct vdso_maps *compat, return ret; } -#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 2) +#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 4) static int vdso_fill_compat_symtable(struct vdso_maps *native, struct vdso_maps *compat) { void *vdso_mmap; From 973b4b631577e896f07dc7126e5cdc0af27515e5 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 26 Jul 2022 13:20:58 +0300 Subject: [PATCH 041/122] zdtm: make root mount private in criu mntns If root mount in criu mntns is slave, it would be slave of host mount where criu is stored, so if someone mounts something in subdir of {criu-dir}/test/ on host while tests are running this mount can influence the test as it appears on top of root mount in criu mntns. 1) With mount-compat this mount can get into restored test mntns, which means wrong restore, as this mount was not there on dump. 2) With mount-v2 this mount would just fail container restore, as root container mount is mounted non-recursively to protect from unexpected mounts appear after restore. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index d264c4878e..aefcb36a4f 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -267,7 +267,7 @@ def __copy_deps(self, deps): def init(self, l_bins, x_bins): subprocess.check_call( - ["mount", "--make-slave", "--bind", ".", self.root]) + ["mount", "--make-private", "--bind", ".", self.root]) self.root_mounted = True if not os.access(self.root + "/.constructed", os.F_OK): From 0576f68d0ac0a781b326168508d507c0ac56914c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 27 Jul 2022 16:03:25 +0300 Subject: [PATCH 042/122] zdtm/mnt_root_ext: don't allow propagation from test mntns to criu mntns This test specifically wants to create external bind-mount of "/" from criu mntns to test mntns, and it wants "/" in criu mntns to be a shared mount so that "external" mount in the test mntns is its slave. This is to trigger specific dirname() resolution which happens only when sharing restore is involved for external mounts, and only if rootfs is involved. But initially I missed that when we create external mount in test's temporary mntns it creates a propagation in criu mntns on top of root mount. This mount may influence other tests restore as child mount in root mount converts to locked child mount in criu service mntns (for uns flavour) and when criu would restore root container mount it would fail with EINVAL on non recursive bind with locked children. To fix this mess we just need to prohibit propagating from the test's temporary mntns to criu mntns by making mounts slave. 
Fixes: #1941 Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/mnt_root_ext.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/zdtm/static/mnt_root_ext.c b/test/zdtm/static/mnt_root_ext.c index 6a2eb068c6..305e872627 100644 --- a/test/zdtm/static/mnt_root_ext.c +++ b/test/zdtm/static/mnt_root_ext.c @@ -51,6 +51,14 @@ int main(int argc, char **argv) return 1; } + /* + * Make mounts in temporary mntns slave, to prevent propagation to criu mntns + */ + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { + pr_perror("make rslave"); + return 1; + } + /* * Populate to the tests root host's rootfs subdir */ From 2549276667c95f7d9a7431e4b1a82829706d093e Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Tue, 26 Jul 2022 23:39:33 +0800 Subject: [PATCH 043/122] files-reg.c: modify the check of ghost_limit to support large sparse files files-reg.c checks whether the file size is larger than ghost_limit with st_size (in dump_ghost_remap), which can not deal with large ghost sparse file, since its actual file size is not the same as what st_size shows. Therefore, in this commit, I replace st_size with st_blocks, which shows the actual file size. (1 block = 512B), thus criu can deal with large ghost sparse file. Signed-off-by: Liang-Chun Chen --- criu/files-reg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index ce87886373..c3761b5ed7 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -29,6 +29,7 @@ * and checked. 
*/ #define BUILD_ID_MAP_SIZE 1048576 +#define ST_UNIT 512 #include "cr_options.h" #include "imgset.h" @@ -946,8 +947,8 @@ static int dump_ghost_remap(char *path, const struct stat *st, int lfd, u32 id, pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id); - if (st->st_size > opts.ghost_limit) { - pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_size); + if (st->st_blocks * ST_UNIT > opts.ghost_limit) { + pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_blocks * ST_UNIT); return -1; } From 4cc4d1d1bb802e2108025f3715aff3dffa68b8a5 Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Wed, 27 Jul 2022 01:45:00 +0800 Subject: [PATCH 044/122] unlink_largefile.desc: remove crfail, since criu now can support unlink_largefile test In the past, the unlink_largefile test was expected to fail on a large ghost file. However, because it uses a sparse file, it now passes with current criu, since the large ghost sparse file issue was fixed. So the crfail flag of this test should be removed. Signed-off-by: Liang-Chun Chen --- test/zdtm/static/unlink_largefile.desc | 1 - 1 file changed, 1 deletion(-) delete mode 100644 test/zdtm/static/unlink_largefile.desc diff --git a/test/zdtm/static/unlink_largefile.desc b/test/zdtm/static/unlink_largefile.desc deleted file mode 100644 index ded89879a9..0000000000 --- a/test/zdtm/static/unlink_largefile.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'crfail'} From d9009f6a3f744ba4e0e016532aa01361df29e60f Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Thu, 28 Jul 2022 13:09:29 +0800 Subject: [PATCH 045/122] zdtm: add two tests for large ghost sparse file ghost_holes_large00 is a test which creates a large ghost sparse file with 1GiB hole(pwrite can only handle 2GiB maximum on 32-bit system) and 8KiB data, criu should be able to handle this kind of situation. 
ghost_holes_large01 is a test which creates a large ghost sparse file with 1GiB hole and 2MiB data, since 2MiB is larger than the default ghost_limit(1MiB), criu should fail on this test. v2: fix overflow on 32-bit arch. Signed-off-by: Liang-Chun Chen --- test/zdtm/static/Makefile | 3 + test/zdtm/static/ghost_holes_large00.c | 152 ++++++++++++++++++++++ test/zdtm/static/ghost_holes_large01.c | 1 + test/zdtm/static/ghost_holes_large01.desc | 1 + 4 files changed, 157 insertions(+) create mode 100644 test/zdtm/static/ghost_holes_large00.c create mode 120000 test/zdtm/static/ghost_holes_large01.c create mode 100644 test/zdtm/static/ghost_holes_large01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 5a8a5f75cb..b28345400c 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -306,6 +306,8 @@ TST_FILE = \ ghost_holes00 \ ghost_holes01 \ ghost_holes02 \ + ghost_holes_large00 \ + ghost_holes_large01 \ unlink_largefile \ mtime_mmap \ fifo \ @@ -609,6 +611,7 @@ unlink_fstat04: CFLAGS += -DUNLINK_FSTAT04 unlink_fstat041: CFLAGS += -DUNLINK_FSTAT041 -DUNLINK_FSTAT04 ghost_holes01: CFLAGS += -DTAIL_HOLE ghost_holes02: CFLAGS += -DHEAD_HOLE +ghost_holes_large01: CFLAGS += -DLIMIT sk-freebind-false: CFLAGS += -DZDTM_FREEBIND_FALSE selinux02: CFLAGS += -DUSING_SOCKCREATE stopped01: CFLAGS += -DZDTM_STOPPED_KILL diff --git a/test/zdtm/static/ghost_holes_large00.c b/test/zdtm/static/ghost_holes_large00.c new file mode 100644 index 0000000000..1a9739f8e9 --- /dev/null +++ b/test/zdtm/static/ghost_holes_large00.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with one large hole(1GiB) in the middle"; +const char *test_author = "Liang-Chun Chen "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for data size */ +#ifdef LIMIT +#define BUFSIZE 1024 * 1024 +#else 
+#define BUFSIZE 4096 +#endif +static unsigned char buf[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +#define DATA1_OFF 0 +#define HOLE_SIZE (1LL * 1 * 1024 * 1024 * 1024) +#define DATA2_OFF (BUFSIZE + HOLE_SIZE) +#define FILE_SIZE (2 * BUFSIZE + HOLE_SIZE) +#define ST_UNIT 512 + +int main(int argc, char **argv) +{ + int fd; + struct stat st; + uint32_t crc; + bool chk_hole = true; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + crc = ~0; + datagen(buf, BUFSIZE, &crc); + if (pwrite(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { + pr_perror("can't write data1"); + goto failed; + } + + crc = ~0; + datagen(buf, BUFSIZE, &crc); + if (pwrite(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { + pr_perror("can't write data2"); + goto failed; + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + test_msg("Won't check for hole\n"); + chk_hole = false; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("file size OK\n"); + + if (st.st_blocks * ST_UNIT != 2 * BUFSIZE) { + fail("actual file size changed to %ld", (long)st.st_blocks * ST_UNIT); + goto failed; + } + + test_msg("actual file size OK\n"); + + /* Data 1 */ + if (pread(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { + fail("pread1 fail"); + goto failed; + } + + crc = ~0; + if (datachk(buf, BUFSIZE, &crc)) { + fail("datachk1 fail"); + goto failed; + } + + test_msg("Data1 OK\n"); + + /* Data 2 */ + if (pread(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { + fail("pread2 fail"); + goto failed; + } + + crc = 
~0; + if (datachk(buf, BUFSIZE, &crc)) { + fail("datachk2 fail"); + goto failed; + } + + test_msg("Data2 OK\n"); + + /* Hole */ + if (chk_hole) { + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + fail("Begin of mid hole not found"); + goto failed; + } + if (lseek(fd, DATA1_OFF + BUFSIZE, SEEK_DATA) != DATA2_OFF) { + fail("End of mid hole not found"); + goto failed; + } + test_msg("Mid hole OK\n"); + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} diff --git a/test/zdtm/static/ghost_holes_large01.c b/test/zdtm/static/ghost_holes_large01.c new file mode 120000 index 0000000000..1b90363d45 --- /dev/null +++ b/test/zdtm/static/ghost_holes_large01.c @@ -0,0 +1 @@ +ghost_holes_large00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_holes_large01.desc b/test/zdtm/static/ghost_holes_large01.desc new file mode 100644 index 0000000000..8e6a476bd7 --- /dev/null +++ b/test/zdtm/static/ghost_holes_large01.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} \ No newline at end of file From 8a0185968a248cd13c541d76fc5eafe86c7a6204 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 4 Aug 2022 16:56:30 +0100 Subject: [PATCH 046/122] MAINTAINERS: Add Radostin (myself) to maintainers I've been contributing to CRIU for sometime and I'm hoping that my familiarity with the project would be sufficient to self-nominate as a maintainer. I would like to help with code reviews, submitting patches, implementing new features, and maintaining the project in general. 
Signed-off-by: Radostin Stoyanov --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index bb153f1ab0..7d53d0dc1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4,3 +4,4 @@ Mike Rapoport Dmitry Safonov <0x7f454c46@gmail.com> Adrian Reber Pavel Tikhomirov +Radostin Stoyanov From f32e626e42c5f53014e5db04ff0bb0516066707e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 23 Jul 2022 18:23:34 +0100 Subject: [PATCH 047/122] ci: unset XDG_RUNTIME_DIR when invoking podman We need to pass environment variables from the CI environment to distinguish between CI environments. However, when `sudo -E` is used to run Podman it results in the XDG_RUNTIME_DIR environment variable being set incorrectly that prevents Podman from running. This patch fixes the following error in the GitHub Action virtual environment: error running container: error from /usr/bin/crun creating container for [/bin/sh -c /bin/prepare-for-fedora-rawhide.sh]: sd-bus call: Connection reset by peer Fixes: #1942 Signed-off-by: Radostin Stoyanov --- .github/workflows/fedora-rawhide-test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index 00bc3b2bda..b6d94d23ed 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -9,4 +9,8 @@ jobs: steps: - uses: actions/checkout@v2 - name: Run Fedora Rawhide Test - run: sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" + # We need to pass environment variables from the CI environment to + # distinguish between CI environments. However, we need to make sure that + # XDG_RUNTIME_DIR environment variable is not set due to a bug in Podman. 
+ # FIXME: https://github.com/containers/podman/issues/14920 + run: sudo -E XDG_RUNTIME_DIR= make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" From 4c86d6a7d54abb64fc5a15131f3351224e8c071b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 31 Jul 2022 16:07:30 +0000 Subject: [PATCH 048/122] criu: fix conflicting headers There are several changes in glibc 2.36 that make sys/mount.h header incompatible with kernel headers: https://sourceware.org/glibc/wiki/Release/2.36#Usage_of_.3Clinux.2Fmount.h.3E_and_.3Csys.2Fmount.h.3E This patch removes conflicting includes for `` and updates the content of `criu/include/linux/mount.h` to match `/usr/include/sys/mount.h`. In addition, inline definitions sys_*() functions have been moved from "linux/mount.h" to "syscall.h" to avoid conflicts with `uapi/compel/plugins/std/syscall.h` and ``. The include for `` has been replaced with local include to avoid conflicts with ``. Fixes: #1949 Signed-off-by: Radostin Stoyanov --- Makefile.config | 2 +- criu/cgroup.c | 1 + criu/cr-check.c | 2 +- criu/cr-restore.c | 3 ++- criu/include/aio.h | 2 +- criu/include/linux/aio_abi.h | 14 +++++++++++ criu/include/linux/mount.h | 48 +++++++++++++++++++----------------- criu/include/syscall.h | 17 +++++++++++++ criu/pie/parasite.c | 2 +- criu/util.c | 1 + scripts/feature-tests.mak | 13 ---------- 11 files changed, 64 insertions(+), 41 deletions(-) create mode 100644 criu/include/linux/aio_abi.h create mode 100644 criu/include/syscall.h diff --git a/Makefile.config b/Makefile.config index d113e2246b..270ec61c0f 100644 --- a/Makefile.config +++ b/Makefile.config @@ -78,7 +78,7 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE \ + SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name 
diff --git a/criu/cgroup.c b/criu/cgroup.c index e05b0832ed..325df6a1db 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -27,6 +27,7 @@ #include "images/cgroup.pb-c.h" #include "kerndat.h" #include "linux/mount.h" +#include "syscall.h" /* * This structure describes set of controller groups diff --git a/criu/cr-check.c b/criu/cr-check.c index f589a91da1..0ca80192ce 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,7 +21,6 @@ #include #include #include -#include #include "../soccr/soccr.h" @@ -52,6 +51,7 @@ #include "net.h" #include "restorer.h" #include "uffd.h" +#include "linux/aio_abi.h" #include "images/inventory.pb-c.h" diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 279246c190..d11d28173a 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -22,7 +22,6 @@ #include #include "common/compiler.h" -#include "linux/mount.h" #include "linux/rseq.h" #include "clone-noasan.h" @@ -86,6 +85,8 @@ #include #include "compel/include/asm/syscall.h" +#include "linux/mount.h" + #include "protobuf.h" #include "images/sa.pb-c.h" #include "images/timer.pb-c.h" diff --git a/criu/include/aio.h b/criu/include/aio.h index d1655739d9..38e7040209 100644 --- a/criu/include/aio.h +++ b/criu/include/aio.h @@ -1,7 +1,7 @@ #ifndef __CR_AIO_H__ #define __CR_AIO_H__ -#include +#include "linux/aio_abi.h" #include "images/mm.pb-c.h" unsigned int aio_estimate_nr_reqs(unsigned int size); int dump_aio_ring(MmEntry *mme, struct vma_area *vma); diff --git a/criu/include/linux/aio_abi.h b/criu/include/linux/aio_abi.h new file mode 100644 index 0000000000..d9ce787203 --- /dev/null +++ b/criu/include/linux/aio_abi.h @@ -0,0 +1,14 @@ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +typedef __kernel_ulong_t aio_context_t; + +/* read() from /dev/aio returns these structures. 
*/ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#endif /* __LINUX__AIO_ABI_H */ diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h index 9a3a28b100..0d55a588cb 100644 --- a/criu/include/linux/mount.h +++ b/criu/include/linux/mount.h @@ -4,32 +4,34 @@ #include "common/config.h" #include "compel/plugins/std/syscall-codes.h" -#ifdef CONFIG_HAS_FSCONFIG -#include -#else +/* Copied from /usr/include/sys/mount.h */ + +#ifndef FSCONFIG_CMD_CREATE +/* The type of fsconfig call made. */ enum fsconfig_command { - FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ - FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ - FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ - FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ - FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ - FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ - FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ +#define FSCONFIG_SET_FLAG FSCONFIG_SET_FLAG + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ +#define FSCONFIG_SET_STRING FSCONFIG_SET_STRING + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ +#define FSCONFIG_SET_BINARY FSCONFIG_SET_BINARY + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ +#define FSCONFIG_SET_PATH FSCONFIG_SET_PATH + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ +#define FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_PATH_EMPTY + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ +#define FSCONFIG_SET_FD FSCONFIG_SET_FD + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ +#define 
FSCONFIG_CMD_CREATE FSCONFIG_CMD_CREATE FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +#define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE }; -#endif +#endif // FSCONFIG_CMD_CREATE -static inline int sys_fsopen(const char *fsname, unsigned int flags) -{ - return syscall(__NR_fsopen, fsname, flags); -} -static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) -{ - return syscall(__NR_fsconfig, fd, cmd, key, value, aux); -} -static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) -{ - return syscall(__NR_fsmount, fd, flags, attr_flags); -} +#ifndef MS_MGC_VAL +/* Magic mount flag number. Has to be or-ed to the flag values. */ +#define MS_MGC_VAL 0xc0ed0000 /* Magic flag number to indicate "new" flags */ +#define MS_MGC_MSK 0xffff0000 /* Magic flag number mask */ +#endif #endif diff --git a/criu/include/syscall.h b/criu/include/syscall.h new file mode 100644 index 0000000000..c38d6d971b --- /dev/null +++ b/criu/include/syscall.h @@ -0,0 +1,17 @@ +#ifndef __CR_SYSCALL_H__ +#define __CR_SYSCALL_H__ + +static inline int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} +static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} +static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + +#endif /* __CR_SYSCALL_H__ */ \ No newline at end of file diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index e7eb1fcb60..f75fe13bb6 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -14,6 +13,7 @@ #include "int.h" #include "types.h" #include +#include "linux/mount.h" #include "parasite.h" #include "fcntl.h" #include "prctl.h" diff --git 
a/criu/util.c b/criu/util.c index 5f69465b44..060ca3bd44 100644 --- a/criu/util.c +++ b/criu/util.c @@ -40,6 +40,7 @@ #include "mem.h" #include "namespaces.h" #include "criu-log.h" +#include "syscall.h" #include "clone-noasan.h" #include "cr_options.h" diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index 014e893a84..fb5d2ef7ad 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -137,19 +137,6 @@ ENTRY(main) END(main) endef -define FEATURE_TEST_FSCONFIG - -#include - -int main(void) -{ - if (FSCONFIG_CMD_CREATE > 0) - return 0; - return 0; -} - -endef - define FEATURE_TEST_NFTABLES_LIB_API_0 #include From 6a1260a7ca1b09d62efa674b8f8a645780dd0abd Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Aug 2022 21:44:11 +0100 Subject: [PATCH 049/122] Revert "ci: Switch to non overlaysfs tests" This reverts commit 8bb05e3bf3fe96ce93071e22330c2701e86b9a55. The following bug has been fixed: https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 5 +---- scripts/ci/podman-test.sh | 6 +----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index f36b4e4581..d4b11bd551 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -21,10 +21,7 @@ add-apt-repository \ . 
/etc/lsb-release -# overlayfs with current Ubuntu kernel breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 -# Use devicemapper storage drive as a work-around -echo '{ "experimental": true, "storage-driver": "devicemapper" }' > /etc/docker/daemon.json +echo '{ "experimental": true }' > /etc/docker/daemon.json CRIU_LOG='/criu.log' mkdir -p /etc/criu diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 414004514b..973d2d722a 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -25,11 +25,7 @@ make install popd rm -rf "${tmp_dir}" -# overlayfs with current Ubuntu kernel breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 -# Use VFS storage drive as a work-around -export STORAGE_DRIVER=vfs -podman --storage-driver vfs info +podman info # shellcheck disable=SC2016 podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' From 557ab8c4c8627bf20903f45cbf0f4d68ff195ccb Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Aug 2022 22:00:16 +0100 Subject: [PATCH 050/122] docker-test: use containerd installed from package In commits [1, 2] the version of containerd installed by default in the GitHub CI virtual environment was replaced with the latest release from GitHub as a workaround to a bug in containerd. This bug has been fixed some time ago and the current default version of containerd (1.6.6) does not require this workaround. However, with the latest release, the containerd binaries uploaded on GitHub have been built for Ubuntu 22.04 [3]. 
Our tests are still running on Ubuntu 20.04 and this results in the following error: /usr/bin/containerd: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /usr/bin/containerd) /usr/bin/containerd: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.32' not found (required by /usr/bin/containerd) [1] https://github.com/checkpoint-restore/criu/commit/046cad8 [2] https://github.com/checkpoint-restore/criu/commit/81a68ad [3] https://github.com/containerd/containerd/commit/6b2dc9a37 Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index d4b11bd551..63941437e3 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -21,24 +21,14 @@ add-apt-repository \ . /etc/lsb-release +# docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf -service docker stop -systemctl stop containerd.service - -# Always use the latest containerd release. -# Restore with containerd versions after v1.2.14 and before v1.5.0-beta.0 are broken. 
-# https://github.com/checkpoint-restore/criu/issues/1223 -CONTAINERD_DOWNLOAD_URL=$(curl -s https://api.github.com/repos/containerd/containerd/releases/latest | grep '"browser_download_url":.*/containerd-.*-linux-amd64.tar.gz.$' | cut -d\" -f4) -wget -nv "$CONTAINERD_DOWNLOAD_URL" -O - | tar -xz -C /usr/ - -systemctl restart containerd.service -service docker restart - export SKIP_CI_TEST=1 ./run-ci-tests.sh From 5f801c41296a66c924abdd443866af88aae286ea Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Aug 2022 14:11:49 +0100 Subject: [PATCH 051/122] cr-check: fix check for apparmor stacking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The feature check for AppArmor stacking was introduced in commit: 8723e3f998d1ec5f125e6600436a96f7ff9c1631 check: add a feature test for apparmor_stacking However, on systems that don't support AppArmor, this check always fails. As a result, `criu check --all` shows the following message: Looks good but some kernel features are missing which, depending on your process tree, may cause dump or restore failure. 
Reported-by: André Rösti (@andrej) Signed-off-by: Radostin Stoyanov --- criu/cr-check.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 0ca80192ce..0f09b902a0 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1478,13 +1478,15 @@ int cr_check(void) ret |= check_newifindex(); ret |= check_pidfd_store(); ret |= check_ns_pid(); - ret |= check_apparmor_stacking(); ret |= check_network_lock_nftables(); ret |= check_sockopt_buf_lock(); ret |= check_memfd_hugetlb(); ret |= check_move_mount_set_group(); ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); + + if (kdat.lsm == LSMTYPE__APPARMOR) + ret |= check_apparmor_stacking(); } /* From ce1b705b157019c873fba33a5b427bafa2af356c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sat, 6 Aug 2022 22:03:15 +0100 Subject: [PATCH 052/122] cr-check: optimize check for apparmor stacking The result of check_aa_ns_dumping() is stored in kdat. Instead of doing the same check twice - once on kerndat_init(), and again in check_apparmor_stacking(), we can check the stored value. Suggested-by: Pavel Tikhomirov Signed-off-by: Radostin Stoyanov --- criu/cr-check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 0f09b902a0..6c95ffb254 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -104,7 +104,7 @@ static int check_tty(void) static int check_apparmor_stacking(void) { - if (!check_aa_ns_dumping()) + if (!kdat.apparmor_ns_dumping_enabled) return -1; return 0; From f0b0a64d1a2339a14de621c6c42516a03a2e95bc Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Wed, 20 Jul 2022 14:36:28 +0300 Subject: [PATCH 053/122] cr-restore: rseq: dynamically handle *libc with rseq Before this patch we assumed that CRIU is compiled against the same GLibc as it runs with. But as we see from real world examples like #1935 it's not always true. 
The idea of this patch is to detect rseq configuration for the main CRIU process and use it to unregister rseq for all further child processes. It's correct, because we restore pstree using clone*() syscalls, don't use exec*() (!) syscalls, so rseq gets inherited in the kernel and rseq configuration remains the same for all children processes. This will prevent issues like this: https://github.com/checkpoint-restore/criu/issues/1935 Suggested-by: Florian Weimer Signed-off-by: Alexander Mikhalitsyn --- criu/cr-restore.c | 16 ++++++++-------- criu/include/kerndat.h | 2 ++ criu/kerndat.c | 25 +++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d11d28173a..5b5b41dfc8 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3103,14 +3103,14 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) #else static void prep_libc_rseq_info(struct rst_rseq_param *rseq) { - /* - * TODO: handle built-in rseq on other libc'ies like musl - * We can do that using get_rseq_conf kernel feature. - * - * For now we just assume that other libc libraries are - * not registering rseq by default. 
- */ - rseq->rseq_abi_pointer = 0; + if (!kdat.has_rseq || !kdat.has_ptrace_get_rseq_conf) { + rseq->rseq_abi_pointer = 0; + return; + } + + rseq->rseq_abi_pointer = kdat.libc_rseq_conf.rseq_abi_pointer; + rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size; + rseq->signature = kdat.libc_rseq_conf.signature; } #endif diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 83d867e75b..a3959c9926 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -7,6 +7,7 @@ #include "asm/kerndat.h" #include "util-vdso.h" #include "hugetlb.h" +#include struct stat; @@ -82,6 +83,7 @@ struct kerndat_s { bool has_openat2; bool has_rseq; bool has_ptrace_get_rseq_conf; + struct __ptrace_rseq_configuration libc_rseq_conf; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index bc5dccab18..0f7d5fc8fb 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -923,6 +923,7 @@ static int kerndat_has_ptrace_get_rseq_conf(void) pid_t pid; int len; struct __ptrace_rseq_configuration rseq; + int ret = 0; pid = fork_and_ptrace_attach(NULL); if (pid < 0) @@ -930,6 +931,9 @@ static int kerndat_has_ptrace_get_rseq_conf(void) len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); if (len != sizeof(rseq)) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = false; pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); goto out; @@ -940,16 +944,27 @@ static int kerndat_has_ptrace_get_rseq_conf(void) * we need to pay attention to that and, possibly, make changes on the CRIU side. 
*/ if (rseq.flags != 0) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = false; pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); } else { + if (!kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = true; + + if (memcmp(&kdat.libc_rseq_conf, &rseq, sizeof(rseq))) + ret = 1; /* we should update kdat */ + + kdat.libc_rseq_conf = rseq; } out: kill(pid, SIGKILL); waitpid(pid, NULL, 0); - return 0; + return ret; } int kerndat_sockopt_buf_lock(void) @@ -1472,6 +1487,12 @@ int kerndat_try_load_new(void) if (ret < 0) return ret; + ret = kerndat_has_ptrace_get_rseq_conf(); + if (ret < 0) { + pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); + return ret; + } + /* New information is found, we need to save to the cache */ if (ret) kerndat_save_cache(); @@ -1657,7 +1678,7 @@ int kerndat_init(void) pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_ptrace_get_rseq_conf()) { + if (!ret && (kerndat_has_ptrace_get_rseq_conf() < 0)) { pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); ret = -1; } From db9781ef06dc1b205ae4ef134dde056fad4c8c31 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Wed, 20 Jul 2022 15:17:35 +0300 Subject: [PATCH 054/122] cr-restore: rseq: use glibc-specific way to unregister only as fallback Let's use dynamic approach to detect built-in *libc rseq in all cases, and "old" static approach as a fallback path if the user kernel lacks support of ptrace_get_rseq_conf feature. 
Suggested-by: Florian Weimer Signed-off-by: Alexander Mikhalitsyn --- criu/cr-restore.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 5b5b41dfc8..919d10ab57 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3088,7 +3088,6 @@ static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc) return 0; } -#if defined(__GLIBC__) && defined(RSEQ_SIG) static void prep_libc_rseq_info(struct rst_rseq_param *rseq) { if (!kdat.has_rseq) { @@ -3096,15 +3095,14 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) return; } - rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); - rseq->rseq_abi_size = __rseq_size; - rseq->signature = RSEQ_SIG; -} + if (!kdat.has_ptrace_get_rseq_conf) { +#if defined(__GLIBC__) && defined(RSEQ_SIG) + rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); + rseq->rseq_abi_size = __rseq_size; + rseq->signature = RSEQ_SIG; #else -static void prep_libc_rseq_info(struct rst_rseq_param *rseq) -{ - if (!kdat.has_rseq || !kdat.has_ptrace_get_rseq_conf) { rseq->rseq_abi_pointer = 0; +#endif return; } @@ -3112,7 +3110,6 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size; rseq->signature = kdat.libc_rseq_conf.signature; } -#endif static rlim_t decode_rlim(rlim_t ival) { From 620606703fdd614846ae1946e2f3712e9c88b888 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 9 Aug 2022 09:42:37 -0700 Subject: [PATCH 055/122] Add Alexander Mikhalitsyn to maintainers Alex implemented a few complex features and maintains our CI system. 
Signed-off-by: Andrei Vagin --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7d53d0dc1e..8fee8e5715 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5,3 +5,4 @@ Dmitry Safonov <0x7f454c46@gmail.com> Adrian Reber Pavel Tikhomirov Radostin Stoyanov +Alexander Mikhalitsyn From 58fa2676369cba21f395c9a89168e3758cb51df9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 11 Aug 2022 09:51:34 +0100 Subject: [PATCH 056/122] docker-test: handle race condition error There is a race condition in docker/containerd that causes docker to occasionally fail when starting a container from a checkpoint immediately after the checkpoint has been created. This problem is unrelated to criu and has been reported in https://github.com/moby/moby/issues/42900 Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index 63941437e3..ca93ed77c3 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -75,17 +75,37 @@ checkpoint_container () { docker wait cr } -restore_container () { - CHECKPOINT_NAME=$1 - - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { +print_logs () { cat "$(grep log 'log file:' | sed 's/log file:\s*//')" || true docker logs cr || true cat $CRIU_LOG || true dmesg docker ps exit 1 - } +} + +declare -i max_restore_container_tries=3 +current_iteration= + +restore_container () { + CHECKPOINT_NAME=$1 + + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { + # FIXME: There is a race condition in docker/containerd that causes + # docker to occasionally fail when starting a container from a + # checkpoint immediately after the checkpoint has been created. 
+ # https://github.com/moby/moby/issues/42900 + if [ "$current_iteration" -gt "$max_restore_container_tries" ]; then + print_logs + fi + grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log && { + ((current_iteration+=1)) + echo "Retry container restore: $current_iteration" + sleep 1; + restore_container "$CHECKPOINT_NAME" + } || + print_logs + } && current_iteration=0 } # Scenario: Create multiple containers and checkpoint and restore them once From 3019db31a53810a7e3478092847c42d33496558f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 30 Mar 2022 19:27:20 -0700 Subject: [PATCH 057/122] ci/cirrus: add CentOS Stream 9 Mostly a copy-paste from the CentOS 8 task, with a few differences: - Use dnf instead of yum - Enable crb instead of powertools - Different way of installing EPEL - No need to switch to python3 as this is the default - junit_xml is now available as an rpm Signed-off-by: Kir Kolyshkin --- .cirrus.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 2b6903ddc5..6a5d751494 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -19,6 +19,34 @@ task: build_script: | make -C scripts/ci vagrant-fedora-no-vdso +task: + name: CentOS Stream 9 based test + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: centos-cloud + image: family/centos-stream-9 + platform: linux + cpu: 4 + memory: 8G + + setup_script: | + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + dnf config-manager --set-enabled crb # Same as CentOS 8 powertools + dnf -y install epel-release epel-next-release + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf 
python-junit_xml python-flake8 xmlto + systemctl stop sssd + # Even with selinux in permissive mode the selinux tests will be executed. + # The Cirrus CI user runs as a service from selinux point of view and is + # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0). + # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode. + setenforce 0 + + build_script: | + make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" + task: name: Vagrant Fedora Rawhide based test environment: From 24100795a3de6192cc2e17f15ceccd09145d71a4 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 18:19:41 -0700 Subject: [PATCH 058/122] ci/cirrus: centos 8 job nits 1. Rename CentOS 8 to CentOS Stream 8 (which it is). 2. Install junit_xml from the repo rather than via pip. Signed-off-by: Kir Kolyshkin --- .cirrus.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 6a5d751494..03ed797480 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -69,7 +69,7 @@ task: make -C scripts/ci vagrant-fedora-rawhide task: - name: CentOS 8 based test + name: CentOS Stream 8 based test environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" @@ -85,7 +85,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel 
protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-junit_xml xmlto alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed @@ -93,7 +93,6 @@ task: # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode setenforce 0 - pip3 install junit_xml build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" From 373281f50dfa9069058aa0bfde05c4f4a40ddd8a Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 7 Aug 2022 16:27:22 -0700 Subject: [PATCH 059/122] compel: set TRACESYSGOOD to distinguish breakpoints from syscalls When delivering system call traps, set bit 7 in the signal number (i.e., deliver SIGTRAP|0x80). This makes it easy for the tracer to distinguish normal traps from those caused by a system call. 
Signed-off-by: Andrei Vagin --- compel/include/ptrace.h | 2 ++ compel/include/uapi/infect.h | 4 ++-- compel/src/lib/infect.c | 27 ++++++++++++++------------- compel/src/lib/ptrace.c | 2 +- criu/cr-restore.c | 13 ++++++++----- 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/compel/include/ptrace.h b/compel/include/ptrace.h index bf2701e632..00013f9370 100644 --- a/compel/include/ptrace.h +++ b/compel/include/ptrace.h @@ -5,6 +5,8 @@ #include #include +#define PTRACE_SYSCALL_TRAP 0x80 + #define PTRACE_SI_EVENT(_si_code) (((_si_code)&0xFFFF) >> 8) extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 7073f343f2..19d4da2b14 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -80,9 +80,9 @@ enum trace_flags { TRACE_EXIT, }; -extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat, enum trace_flags trace); +extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat); -extern int __must_check compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); +extern int __must_check compel_stop_pie(pid_t pid, void *addr, bool no_bp); extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index b99f23b360..7d78654805 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -304,6 +304,11 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ goto try_again; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } + if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0) goto err; @@ -1366,7 +1371,6 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pid_t pid = ctl->rpid; user_regs_struct_t regs; int status, ret = 
0; - enum trace_flags flag; /* stop getting chld from parasite -- we're about to step-by-step it */ if (restore_child_handler(ctl)) @@ -1407,11 +1411,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return -1; /* Go to sigreturn as closer as we can */ - ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + ret = compel_stop_pie(pid, ctl->sigreturn_addr, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret; - if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag)) + if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) return -1; if (ptrace_flush_breakpoints(pid)) @@ -1546,7 +1550,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) if (ret) goto err; - ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1), TRACE_ENTER); + ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1)); /* * Don't touch extended registers here: they were restored @@ -1558,7 +1562,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) return ret; } -int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) +int compel_stop_pie(pid_t pid, void *addr, bool no_bp) { int ret; @@ -1575,7 +1579,6 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) * PIE will stop on a breakpoint, next * stop after that will be syscall enter. 
*/ - *tf = TRACE_EXIT; return 0; } @@ -1588,14 +1591,12 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) pr_perror("Unable to restart the %d process", pid); return -1; } - - *tf = TRACE_ENTER; return 0; } static bool task_is_trapped(int status, pid_t pid) { - if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + if (WIFSTOPPED(status) && (WSTOPSIG(status) & ~PTRACE_SYSCALL_TRAP) == SIGTRAP) return true; pr_err("Task %d is in unexpected state: %x\n", pid, status); @@ -1629,15 +1630,13 @@ static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, const * sys_nr - the required syscall number * sys_nr_compat - the required compatible syscall number */ -int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, enum trace_flags trace) +int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) { + enum trace_flags trace = tasks > 1 ? TRACE_ALL : TRACE_ENTER; user_regs_struct_t regs; int status, ret; pid_t pid; - if (tasks > 1) - trace = TRACE_ALL; - /* Stop all threads on the enter point in sys_rt_sigreturn */ while (tasks) { pid = wait4(-1, &status, __WALL, NULL); @@ -1651,6 +1650,8 @@ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, pr_debug("%d was trapped\n", pid); + if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) + goto goon; if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); diff --git a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c index 49b685d707..717ee28390 100644 --- a/compel/src/lib/ptrace.c +++ b/compel/src/lib/ptrace.c @@ -23,7 +23,7 @@ int ptrace_suspend_seccomp(pid_t pid) { - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) < 0) { pr_perror("suspending seccomp failed"); return -1; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 919d10ab57..9a1b23999c 100644 --- 
a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1964,6 +1964,10 @@ static int attach_to_tasks(bool root_seized) return -1; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } /* * Suspend seccomp if necessary. We need to do this because * although seccomp is restored at the very end of the @@ -2028,7 +2032,7 @@ static int restore_rseq_cs(void) return 0; } -static int catch_tasks(bool root_seized, enum trace_flags *flag) +static int catch_tasks(bool root_seized) { struct pstree_item *item; @@ -2058,7 +2062,7 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, flag, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); if (ret < 0) return -1; } @@ -2225,7 +2229,6 @@ static void reap_zombies(void) static int restore_root_task(struct pstree_item *init) { - enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item; @@ -2440,7 +2443,7 @@ static int restore_root_task(struct pstree_item *init) timing_stop(TIME_RESTORE); - if (catch_tasks(root_seized, &flag)) { + if (catch_tasks(root_seized)) { pr_err("Can't catch all tasks\n"); goto out_kill_network_unlocked; } @@ -2450,7 +2453,7 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_COMPLETE); - ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1)); if (ret) { pr_err("Can't stop all tasks on rt_sigreturn\n"); goto out_kill_network_unlocked; From 40f5d9b4592788f9975605625cc749dea7f2bec2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 7 Aug 2022 16:36:15 -0700 Subject: [PATCH 060/122] compel: clear a breakpoint right after it's 
been triggered Breakpoints are used to stop as close as possible to a target system call. First, we don't need it after this point. Second, PTRACE_CONT can't pass through a breakpoint on arm64. Signed-off-by: Andrei Vagin --- compel/src/lib/infect.c | 15 +++++++++++---- criu/cr-restore.c | 21 --------------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 7d78654805..6413a1860b 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1418,9 +1418,6 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) return -1; - if (ptrace_flush_breakpoints(pid)) - return -1; - /* * All signals are unblocked now. The kernel notifies about leaving * syscall before starting to deliver signals. All parasite code are @@ -1650,8 +1647,18 @@ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) pr_debug("%d was trapped\n", pid); - if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) + if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) { + /* + * On some platforms such as ARM64, it is impossible to + * pass through a breakpoint, so let's clear it right + * after it has been triggered. 
+ */ + if (ptrace_flush_breakpoints(pid)) { + pr_err("Unable to clear breakpoints\n"); + return -1; + } goto goon; + } if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9a1b23999c..9c480be789 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2071,24 +2071,6 @@ static int catch_tasks(bool root_seized) return 0; } -static int clear_breakpoints(void) -{ - struct pstree_item *item; - int ret = 0, i; - - if (fault_injected(FI_NO_BREAKPOINTS)) - return 0; - - for_each_pstree_item(item) { - if (!task_alive(item)) - continue; - for (i = 0; i < item->nr_threads; i++) - ret |= ptrace_flush_breakpoints(item->threads[i].real); - } - - return ret; -} - static void finalize_restore(void) { struct pstree_item *item; @@ -2459,9 +2441,6 @@ static int restore_root_task(struct pstree_item *init) goto out_kill_network_unlocked; } - if (clear_breakpoints()) - pr_err("Unable to flush breakpoints\n"); - finalize_restore(); /* * Some external devices such as GPUs might need a very late From 267c9bc55e689e37f425a2b7579c0fbd93a2394b Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Fri, 15 Apr 2022 13:00:04 +0800 Subject: [PATCH 061/122] compel: switch breakpoint functions to non-inline at arm64 platform Signed-off-by: fu.lin Signed-off-by: Andrei Vagin --- .../aarch64/src/lib/include/uapi/asm/breakpoints.h | 11 ++--------- compel/arch/aarch64/src/lib/infect.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h index 5f090490d9..796aec0160 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -2,14 +2,7 @@ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} - -static inline int 
ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} +int ptrace_set_breakpoint(pid_t pid, void *addr); +int ptrace_flush_breakpoints(pid_t pid); #endif diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index bd1ed0da35..316ff73e7b 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -176,3 +176,13 @@ unsigned long compel_task_size(void) break; return task_size; } + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} From cc8c6b4cd474821c61747b5d7e56c5b70e8de3bb Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Sun, 7 Aug 2022 16:52:39 -0700 Subject: [PATCH 062/122] breakpoint: implement hw breakpoint for arm64 platform The x86 implement hardware breakpoint to accelerate the tracing syscall procedure instead of `ptrace(PTRACE_SYSCALL)`. The arm64 has the same capability according to <>[[1]]. <[[2]] illustrates the usage detailly: - D2.8 Breakpoint Instruction exceptions - D2.9 Breakpoint exceptions - D13.3.2 DBGBCR_EL1, Debug Breakpoint Control Registers, n Note: [1]: https://developer.arm.com/documentation/102120/0100 [2]: https://developer.arm.com/documentation/ddi0487/latest Signed-off-by: fu.lin Signed-off-by: Andrei Vagin --- .../src/lib/include/uapi/asm/breakpoints.h | 34 +++++++ compel/arch/aarch64/src/lib/infect.c | 91 ++++++++++++++++++- 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h index 796aec0160..8a61b268f8 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -2,6 +2,40 @@ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT +#include +#include + +struct hwbp_cap { + char arch; + char bp_count; +}; + +/* copied from `linux/arch/arm64/include/asm/hw_breakpoint.h` */ 
+/* Lengths */ +#define ARM_BREAKPOINT_LEN_1 0x1 +#define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 +#define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f +#define ARM_BREAKPOINT_LEN_8 0xff + +/* Privilege Levels */ +#define AARCH64_BREAKPOINT_EL1 1 +#define AARCH64_BREAKPOINT_EL0 2 + +/* Breakpoint */ +#define ARM_BREAKPOINT_EXECUTE 0 + +/* Watchpoints */ +#define ARM_BREAKPOINT_LOAD 1 +#define ARM_BREAKPOINT_STORE 2 +#define AARCH64_ESR_ACCESS_MASK (1 << 6) + +#define DISABLE_HBP 0 +#define ENABLE_HBP 1 + int ptrace_set_breakpoint(pid_t pid, void *addr); int ptrace_flush_breakpoints(pid_t pid); diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 316ff73e7b..7b75da8907 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -2,7 +2,9 @@ #include #include #include +#include #include + #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" @@ -10,6 +12,7 @@ #include "errno.h" #include "infect.h" #include "infect-priv.h" +#include "asm/breakpoints.h" unsigned __page_size = 0; unsigned __page_shift = 0; @@ -177,12 +180,98 @@ unsigned long compel_task_size(void) return task_size; } +static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) +{ + static struct hwbp_cap info; + static int available = -1; + + if (available == -1) { + unsigned int val; + struct iovec iovec = { + .iov_base = &val, + .iov_len = sizeof(val), + }; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_HW_BREAK, &iovec) < 0) + available = 0; + else { + info.arch = (char)((val >> 8) & 0xff); + info.bp_count = (char)(val & 0xff); + + available = (info.arch != 0); + } + } + + return available == 1 ? 
&info : NULL; +} + int ptrace_set_breakpoint(pid_t pid, void *addr) { - return 0; + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + /* + * The struct is copied from `arch/arm64/include/asm/hw_breakpoint.h` in + * linux kernel: + * struct arch_hw_breakpoint_ctrl { + * __u32 __reserved : 19, + * len : 8, + * type : 2, + * privilege : 2, + * enabled : 1; + * }; + * + * The part of `struct arch_hw_breakpoint_ctrl` bits meaning is defined + * in <>, + * D13.3.2 DBGBCR_EL1, Debug Breakpoint Control Registers. + */ + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | ENABLE_HBP; + regs.dbg_regs[0].addr = (__u64)addr; + regs.dbg_regs[0].ctrl = ctrl; + iovec.iov_base = &regs; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + + if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { + pr_perror("Unable to restart the stopped tracee process %d", pid); + return -1; + } + + return 1; } int ptrace_flush_breakpoints(pid_t pid) { + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | DISABLE_HBP; + regs.dbg_regs[0].addr = 0ul; + regs.dbg_regs[0].ctrl = ctrl; + + iovec.iov_base = &regs; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + return 0; } From ec49f42018dfb4beb6620bbaf6623e2c7cc4a5c7 Mon Sep 17 00:00:00 2001 From:
"fu.lin" Date: Tue, 9 Aug 2022 12:18:00 -0700 Subject: [PATCH 063/122] breakpoint: enable breakpoints by default on amd64 and arm64 Signed-off-by: fu.lin Signed-off-by: Andrei Vagin --- compel/arch/aarch64/src/lib/infect.c | 12 ++++++++++++ compel/arch/x86/src/lib/infect.c | 11 +++++++++++ criu/include/fault-injection.h | 8 -------- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 7b75da8907..d0189f0039 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -207,6 +207,7 @@ static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) int ptrace_set_breakpoint(pid_t pid, void *addr) { + k_rtsigset_t block; struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); struct user_hwdebug_state regs = {}; unsigned int ctrl = 0; @@ -242,6 +243,17 @@ int ptrace_set_breakpoint(pid_t pid, void *addr) if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) return -1; + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. 
+ */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } + if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { pr_perror("Unable to restart the stopped tracee process %d", pid); return -1; diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index c0e7a544a0..01959b95b2 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -588,6 +588,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) int ptrace_set_breakpoint(pid_t pid, void *addr) { + k_rtsigset_t block; int ret; /* Set a breakpoint */ @@ -603,6 +604,16 @@ int ptrace_set_breakpoint(pid_t pid, void *addr) return -1; } + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. + */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index f33918de86..69d670be93 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -24,14 +24,6 @@ enum faults { static inline bool __fault_injected(enum faults f, enum faults fi_strategy) { - /* - * Temporary workaround for Xen guests. Breakpoints degrade - * performance linearly, so until we find out the reason, - * let's disable them. 
- */ - if (f == FI_NO_BREAKPOINTS) - return true; - return fi_strategy == f; } From 6e35c5922e4dc689ff6f6ee5eba8e1013a875e8a Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 6 Apr 2022 18:35:26 +0200 Subject: [PATCH 064/122] criu: fail migration if data was sent to an in-flight socket Before this change, CRIU would just lose that data upon migration. So it's better to fail migration in this case. To reproduce the bug one can: 1. Create an AF_UNIX socket and call listen on it. 2. Create a second AF_UNIX socket and call connect to the first one. 3. Send the data to the second socket. 4. Migrate. 5. Call accept on the first socket and then read. There would be no data available. It should be even possible to close the second socket before migration. This would cause accept to hang because CRIU totally misses a closed in-flight socket. Signed-off-by: Michal Clapinski --- criu/sk-unix.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 47e1b2962a..873360bfad 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -497,9 +497,34 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) goto err; } + if (sk->wqlen != 0) { + /* + * There's no known way to get data out of the write + * queue of an icon socket. The only good solution for + * now is to fail the migration. + */ + pr_err("Non-empty write queue on an in-flight socket %#x\n", ue->ino); + goto err; + } + ue->peer = e->sk_desc->sd.ino; pr_debug("\t\tFixed inflight socket %u peer %u)\n", ue->ino, ue->peer); + } else if (ue->state == TCP_LISTEN) { + int i; + + for (i = 0; i < sk->nr_icons; i++) + if (sk->icons[i] == 0) { + /* + * Inode of an icon socket equal to 0 means + * it's already been closed. That means we have + * no simple way to check if it sent any data. + * The only good solution for now is to fail + * the migration. 
+ */ + pr_err("Found a closed in-flight socket to %#x\n", ue->ino); + goto err; + } } dump: if (dump_socket_opts(lfd, skopts)) From edb3e522657971caa7efe0050bb58d6fe2969109 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 15 Apr 2022 15:46:34 -0700 Subject: [PATCH 065/122] zdtm: return 1 from pr_err, pr_perror, fail This allows to make test code more compact: if (ret == -1) { pr_perror("XXX"); return 1; } vs if (ret == -1) return pr_perror("XXX"); Signed-off-by: Andrei Vagin --- test/zdtm/lib/zdtmtst.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index ed7c23ee26..d91886d258 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -126,11 +126,25 @@ extern int write_pidfile(int pid); /* message helpers */ extern int test_log_init(const char *outfile, const char *suffix); extern int zdtm_seccomp; -#define pr_err(format, arg...) test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg) -#define pr_perror(format, arg...) \ - test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#define fail(format, arg...) \ - test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#define pr_err(format, arg...) \ + ({ \ + test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg); \ + 1; \ + }) + +#define pr_perror(format, arg...) \ + ({ \ + test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ + strerror(errno)); \ + 1; \ + }) + +#define fail(format, arg...) \ + ({ \ + test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ + strerror(errno)); \ + 1; \ + }) #define skip(format, arg...) 
test_msg("SKIP: %s:%d: " format "\n", __FILE__, __LINE__, ##arg) #define pass() test_msg("PASS\n") From 309e1315fb675e69fa6d1a39eddb97f196cb5296 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 15 Apr 2022 15:41:25 -0700 Subject: [PATCH 066/122] test/unix: check C/R of unix listen queues Check that CRIU handles non-empty listen queues properly. Signed-off-by: Andrei Vagin [mclapinski@google.com: update test_doc and test_author] Signed-off-by: Michal Clapinski --- test/zdtm/static/Makefile | 8 ++ test/zdtm/static/sk-unix-listen01.c | 117 +++++++++++++++++++++++++ test/zdtm/static/sk-unix-listen02.c | 1 + test/zdtm/static/sk-unix-listen02.desc | 1 + test/zdtm/static/sk-unix-listen03.c | 1 + test/zdtm/static/sk-unix-listen03.desc | 1 + test/zdtm/static/sk-unix-listen04.c | 1 + test/zdtm/static/sk-unix-listen04.desc | 1 + 8 files changed, 131 insertions(+) create mode 100644 test/zdtm/static/sk-unix-listen01.c create mode 120000 test/zdtm/static/sk-unix-listen02.c create mode 100644 test/zdtm/static/sk-unix-listen02.desc create mode 120000 test/zdtm/static/sk-unix-listen03.c create mode 100644 test/zdtm/static/sk-unix-listen03.desc create mode 120000 test/zdtm/static/sk-unix-listen04.c create mode 100644 test/zdtm/static/sk-unix-listen04.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index b28345400c..0ac22731b3 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -352,6 +352,10 @@ TST_FILE = \ socket_close_data01 \ fifo_upon_unix_socket00 \ fifo_upon_unix_socket01 \ + sk-unix-listen01 \ + sk-unix-listen02 \ + sk-unix-listen03 \ + sk-unix-listen04 \ TST_DIR = \ cwd00 \ @@ -670,6 +674,10 @@ bpf_array: LDLIBS += -lbpf fifo_upon_unix_socket01: CFLAGS += -DFIFO_UPON_UNIX01 +sk-unix-listen02: CFLAGS += -DSK_UNIX_LISTEN02 +sk-unix-listen03: CFLAGS += -DSK_UNIX_LISTEN03 +sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/sk-unix-listen01.c 
b/test/zdtm/static/sk-unix-listen01.c new file mode 100644 index 0000000000..5c9274acb0 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen01.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test in-flight unix sockets with data in them\n"; +const char *test_author = "Andrei Vagin "; + +#define SK_DATA "packet" + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +#define TEST_MODE 0640 + +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + +int main(int argc, char *argv[]) +{ + struct sockaddr_un addr; + unsigned int addrlen; + int ssk, sk; + + char path[PATH_MAX]; + char *cwd; + int ret; + + test_init(argc, argv); + + cwd = get_current_dir_name(); + if (!cwd) + return pr_perror("get_current_dir_name"); + + snprintf(path, sizeof(path), "%s/%s", cwd, filename); + unlink(path); + + addr.sun_family = AF_UNIX; + addrlen = strlen(filename); + if (addrlen > sizeof(addr.sun_path)) + return pr_err("address is too long"); + memcpy(addr.sun_path, filename, addrlen); + addrlen += sizeof(addr.sun_family); + + ssk = socket(AF_UNIX, SOCK_TYPE, 0); + if (ssk == -1) + return pr_perror("socket"); + + sk = socket(AF_UNIX, SOCK_TYPE, 0); + if (sk < 0) + return pr_perror("socket"); + + ret = bind(ssk, (struct sockaddr *)&addr, addrlen); + if (ret) + return pr_perror("bind"); + + ret = listen(ssk, 16); + if (ret) + return pr_perror("listen"); + + if (connect(sk, (struct sockaddr *)&addr, addrlen)) + return pr_perror("connect"); + +#ifdef SK_UNIX_LISTEN02 + { + char buf[64]; + memset(buf, 0, sizeof(buf)); + write(sk, SK_DATA, sizeof(SK_DATA)); + } +#endif + +#ifdef SK_UNIX_LISTEN03 + close(sk); + sk = -1; +#endif + + test_daemon(); + test_waitsig(); + + if (sk != -1) + close(sk); + + ret = accept(ssk, NULL, NULL); + if (ret < 0) + return fail("accept"); + 
+#ifdef SK_UNIX_LISTEN02 + { + char buf[64]; + if (read(ret, &buf, sizeof(buf)) != sizeof(SK_DATA)) + return pr_perror("read"); + + if (strcmp(buf, SK_DATA)) + return fail("data corrupted"); + } +#endif + + close(ssk); + unlink(path); + + pass(); + return 0; +} diff --git a/test/zdtm/static/sk-unix-listen02.c b/test/zdtm/static/sk-unix-listen02.c new file mode 120000 index 0000000000..1211f46660 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen02.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen02.desc b/test/zdtm/static/sk-unix-listen02.desc new file mode 100644 index 0000000000..ded89879a9 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen02.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/test/zdtm/static/sk-unix-listen03.c b/test/zdtm/static/sk-unix-listen03.c new file mode 120000 index 0000000000..1211f46660 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen03.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen03.desc b/test/zdtm/static/sk-unix-listen03.desc new file mode 100644 index 0000000000..ded89879a9 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen03.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/test/zdtm/static/sk-unix-listen04.c b/test/zdtm/static/sk-unix-listen04.c new file mode 120000 index 0000000000..1211f46660 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen04.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen04.desc b/test/zdtm/static/sk-unix-listen04.desc new file mode 100644 index 0000000000..ded89879a9 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen04.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} From 3aafc55e90a256fec535e66ef52b0fde55514dc6 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Wed, 24 Aug 2022 20:07:25 -0700 Subject: [PATCH 067/122] gitignore: Ignore top-level build dir only The entry "build/" will ignore any directory named "build" at any
level of the source tree, including our scripts/build directory. We only want to ignore the top-level build directory created by `make install`. As the git manpage suggests, entries with slashes at the start or in the middle will only match at the same level as the .gitignore, hence use build/** instead. Signed-off-by: Younes Manton --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d5135f5f8d..23894d631b 100644 --- a/.gitignore +++ b/.gitignore @@ -42,4 +42,4 @@ lib/.crit-setup.files compel/include/asm include/common/asm include/common/config.h -build/ +build/** From 84a72696500c02c1b16ff3764ed57b8d0502a093 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Wed, 24 Aug 2022 19:48:52 -0700 Subject: [PATCH 068/122] ci: Rename openj9 Dockerfiles to hotspot We used to pull AdoptOpenJDK's OpenJ9 builds but switched to Eclipse Temurin, which uses the HotSpot VM instead of OpenJ9. Rename the corresponding Dockerfiles to hotspot. Signed-off-by: Younes Manton --- .../{Dockerfile.openj9-alpine => Dockerfile.hotspot-alpine} | 5 +---- .../{Dockerfile.openj9-ubuntu => Dockerfile.hotspot-ubuntu} | 0 2 files changed, 1 insertion(+), 4 deletions(-) rename scripts/build/{Dockerfile.openj9-alpine => Dockerfile.hotspot-alpine} (69%) rename scripts/build/{Dockerfile.openj9-ubuntu => Dockerfile.hotspot-ubuntu} (100%) diff --git a/scripts/build/Dockerfile.openj9-alpine b/scripts/build/Dockerfile.hotspot-alpine similarity index 69% rename from scripts/build/Dockerfile.openj9-alpine rename to scripts/build/Dockerfile.hotspot-alpine index f92011283c..d6e6e51308 100644 --- a/scripts/build/Dockerfile.openj9-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,6 +1,4 @@ -# FIXME: Replace with eclipse-temurin once Alpine support has been added. 
-# https://github.com/adoptium/containers/pull/60 -FROM adoptopenjdk/openjdk8-openj9:alpine +FROM docker.io/library/eclipse-temurin:8-alpine ARG CC=gcc RUN apk update && apk add \ @@ -29,4 +27,3 @@ WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu similarity index 100% rename from scripts/build/Dockerfile.openj9-ubuntu rename to scripts/build/Dockerfile.hotspot-ubuntu From 8556d83e8e659cd20eb70f4d010af61949fb177e Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Wed, 24 Aug 2022 20:09:50 -0700 Subject: [PATCH 069/122] ci: Add Dockerfile for openj9 on Ubuntu Semeru builds (which use OpenJ9 instead of HotSpot) are the successors of AdoptOpenJDK's OpenJ9 builds. Signed-off-by: Younes Manton --- scripts/build/Dockerfile.openj9-ubuntu | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 scripts/build/Dockerfile.openj9-ubuntu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu new file mode 100644 index 0000000000..2e35358ff5 --- /dev/null +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -0,0 +1,33 @@ +FROM docker.io/library/ibm-semeru-runtimes:open-8-jdk-focal +ARG CC=gcc + +COPY scripts/ci/apt-install /bin/apt-install + +RUN apt-install protobuf-c-compiler \ + libprotobuf-c-dev \ + libaio-dev \ + python3-future \ + libprotobuf-dev \ + protobuf-compiler \ + libcap-dev \ + libnl-3-dev \ + gdb \ + bash \ + python3-protobuf \ + python3-yaml \ + libnet-dev \ + libnl-route-3-dev \ + libbsd-dev \ + make \ + git \ + pkg-config \ + iptables \ + gcc \ + maven + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && make -j $(nproc) CC="$CC" + +ENTRYPOINT mvn -q -f test/javaTests/pom.xml test From 1ba1c39de4adc34f84cf05303bff201fd6f0fcda Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Wed, 24 Aug 2022 21:17:06 -0700 Subject: [PATCH 070/122] ci: Clean up and improve Java testing This patch changes top-level OpenJ9 filename and data references to Java to make them generic and launches tests against both HotSpot and OpenJ9 JVMs. Signed-off-by: Younes Manton --- .../{openj9-test.yml => java-test.yml} | 6 ++--- scripts/ci/Makefile | 4 +-- scripts/ci/java-test.sh | 25 +++++++++++++++++++ scripts/ci/openj9-test.sh | 20 --------------- 4 files changed, 30 insertions(+), 25 deletions(-) rename .github/workflows/{openj9-test.yml => java-test.yml} (54%) create mode 100755 scripts/ci/java-test.sh delete mode 100755 scripts/ci/openj9-test.sh diff --git a/.github/workflows/openj9-test.yml b/.github/workflows/java-test.yml similarity index 54% rename from .github/workflows/openj9-test.yml rename to .github/workflows/java-test.yml index 1d7a1eb6b7..211953495b 100644 --- a/.github/workflows/openj9-test.yml +++ b/.github/workflows/java-test.yml @@ -1,4 +1,4 @@ -name: OpenJ9 Test +name: Java Test on: [push, pull_request] @@ -7,5 +7,5 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - - name: Run OpenJ9 Test - run: sudo make -C scripts/ci openj9-test + - name: Run Java Test + run: sudo make -C scripts/ci java-test diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 120f561e48..3a1634fb8b 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -85,8 +85,8 @@ podman-test: # overlayfs behaves differently on Ubuntu and breaks CRIU # https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 # Switch to devicemapper -openj9-test: restart-docker - ./openj9-test.sh +java-test: restart-docker + ./java-test.sh setup-vagrant: ./vagrant.sh setup diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh new file mode 
100755 index 0000000000..7cf704f074 --- /dev/null +++ b/scripts/ci/java-test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cd ../.. || exit 1 + +failures="" + +docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . +if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then + failures="$failures openj9-ubuntu" +fi + +docker build -t criu-hotspot-alpine-test:latest -f scripts/build/Dockerfile.hotspot-alpine . +if ! docker run --rm --privileged criu-hotspot-alpine-test:latest; then + failures="$failures hotspot-alpine" +fi + +docker build -t criu-hotspot-ubuntu-test:latest -f scripts/build/Dockerfile.hotspot-ubuntu . +if ! docker run --rm --privileged criu-hotspot-ubuntu-test:latest; then + failures="$failures hotspot-ubuntu" +fi + +if [ -n "$failures" ]; then + echo "Tests failed on $failures" + exit 1 +fi diff --git a/scripts/ci/openj9-test.sh b/scripts/ci/openj9-test.sh deleted file mode 100755 index b8c07f1802..0000000000 --- a/scripts/ci/openj9-test.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -cd ../.. || exit 1 - -failures="" - -docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . -if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then - failures="$failures ubuntu" -fi - -docker build -t criu-openj9-alpine-test:latest -f scripts/build/Dockerfile.openj9-alpine . -if ! 
docker run --rm --privileged criu-openj9-alpine-test:latest; then - failures="$failures alpine" -fi - -if [ -n "$failures" ]; then - echo "Tests failed on $failures" - exit 1 -fi From 517c0947050e63aac72f63a3bf373d76264723b9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 24 Aug 2022 21:20:30 +0200 Subject: [PATCH 071/122] mount: add definition for FSOPEN_CLOEXEC A recent change in glibc introduced `enum fsconfig_command` [1] and as a result the compilation of criu fails with the following errors In file included from criu/pie/util.c:3: /usr/include/sys/mount.h:240:6: error: redeclaration of 'enum fsconfig_command' 240 | enum fsconfig_command | ^~~~~~~~~~~~~~~~ In file included from /usr/include/sys/mount.h:32: criu/include/linux/mount.h:11:6: note: originally defined here 11 | enum fsconfig_command { | ^~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:242:3: error: redeclaration of enumerator 'FSCONFIG_SET_FLAG' 242 | FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ | ^~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:12:9: note: previous definition of 'FSCONFIG_SET_FLAG' with type 'enum fsconfig_command' 12 | FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ | ^~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:244:3: error: redeclaration of enumerator 'FSCONFIG_SET_STRING' 244 | FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ | ^~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:14:9: note: previous definition of 'FSCONFIG_SET_STRING' with type 'enum fsconfig_command' 14 | FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ | ^~~~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:246:3: error: redeclaration of enumerator 'FSCONFIG_SET_BINARY' 246 | FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ | ^~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:16:9: note: previous definition of 'FSCONFIG_SET_BINARY' with type 'enum fsconfig_command' 16 | FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a 
binary blob value */ | ^~~~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:248:3: error: redeclaration of enumerator 'FSCONFIG_SET_PATH' 248 | FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ | ^~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:18:9: note: previous definition of 'FSCONFIG_SET_PATH' with type 'enum fsconfig_command' 18 | FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ | ^~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:250:3: error: redeclaration of enumerator 'FSCONFIG_SET_PATH_EMPTY' 250 | FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ | ^~~~~~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:20:9: note: previous definition of 'FSCONFIG_SET_PATH_EMPTY' with type 'enum fsconfig_command' 20 | FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ | ^~~~~~~~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:252:3: error: redeclaration of enumerator 'FSCONFIG_SET_FD' 252 | FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ | ^~~~~~~~~~~~~~~ criu/include/linux/mount.h:22:9: note: previous definition of 'FSCONFIG_SET_FD' with type 'enum fsconfig_command' 22 | FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ | ^~~~~~~~~~~~~~~ /usr/include/sys/mount.h:254:3: error: redeclaration of enumerator 'FSCONFIG_CMD_CREATE' 254 | FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ | ^~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:24:9: note: previous definition of 'FSCONFIG_CMD_CREATE' with type 'enum fsconfig_command' 24 | FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ | ^~~~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:256:3: error: redeclaration of enumerator 'FSCONFIG_CMD_RECONFIGURE' 256 | FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ | ^~~~~~~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:26:9: note: previous definition of 'FSCONFIG_CMD_RECONFIGURE' with type 'enum fsconfig_command' 26 | 
FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ This patch adds definition for FSOPEN_CLOEXEC to solve this problem. In particular, sys/mount.h includes ifndef check for FSOPEN_CLOEXEC surrounding `enum fsconfig_command`. [1] https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=7eae6a91e9b1670330c9f15730082c91c0b1d570 Reported-by: Younes Manton (@ymanton) Signed-off-by: Radostin Stoyanov --- criu/include/linux/mount.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h index 0d55a588cb..fefafa89e1 100644 --- a/criu/include/linux/mount.h +++ b/criu/include/linux/mount.h @@ -6,7 +6,7 @@ /* Copied from /usr/include/sys/mount.h */ -#ifndef FSCONFIG_CMD_CREATE +#ifndef FSOPEN_CLOEXEC /* The type of fsconfig call made. */ enum fsconfig_command { FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ @@ -26,7 +26,13 @@ enum fsconfig_command { FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ #define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE }; -#endif // FSCONFIG_CMD_CREATE + +#endif // FSOPEN_CLOEXEC + +/* fsopen flags. With the redundant definition, we check if the kernel, + * glibc value and our value still match. + */ +#define FSOPEN_CLOEXEC 0x00000001 #ifndef MS_MGC_VAL /* Magic mount flag number. Has to be or-ed to the flag values. */ From 94bfff77edac467b69b787f7aaf0ba4fac1feb95 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 22 Jun 2022 12:12:07 +0300 Subject: [PATCH 072/122] criu-ns: capture controlling tty When we are restoring in new pidns we specifically do setsid() from criu-ns init so that sids of restored tasks are non-zero in this pidns and on next dump CRIU would not have problems with zero sids, see [1]. 
But after this CRIU tries to inherit and set up a tty for the restored process, and it fails to set its process group via TIOCSPGRP to be a foreground group for its tty, because tty already is a controlling tty for another session (which we had before setsid). So to make it restore we need to reset tty to be a controlling tty of criu-ns init via TIOCSCTTY before calling criu. Else when restoring first time via criu-ns (from criu-ns dump) we get: Error (criu/tty.c:689): tty: Failed to set group 40816 on 0: Inappropriate ioctl for device https://github.com/checkpoint-restore/criu/issues/232 [1] v2: add why and what comment in code, set controlling tty only for --shell-job and fail if stdin is not a tty. Fixes: #1893 Signed-off-by: Pavel Tikhomirov --- scripts/criu-ns | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/criu-ns b/scripts/criu-ns index 1217c3dcdf..d51e7772c0 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -4,6 +4,8 @@ import ctypes.util import errno import sys import os +import fcntl +import termios # constants for unshare CLONE_NEWNS = 0x00020000 @@ -124,6 +126,16 @@ def wrap_restore(): criu_pid = os.fork() if criu_pid == 0: os.setsid() + # Set stdin tty to be a controlling tty of our new session, this is + # required by --shell-job option, as for it CRIU would try to set a + # process group of restored root task to be a foreground group on the + # terminal. 
+ if '--shell-job' in restore_args or '-j' in restore_args: + if os.isatty(sys.stdin.fileno()): + fcntl.ioctl(sys.stdin.fileno(), termios.TIOCSCTTY, 1) + else: + raise OSError(errno.EINVAL, 'The stdin is not a tty for a --shell-job') + _mount_new_proc() run_criu(restore_args) From 2666eec7bbb34e3d9099a46bf12fd8e6c124179e Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 22 Jun 2022 13:09:20 +0300 Subject: [PATCH 073/122] files-reg: skip failed mount lookup for shell-job's tty When we restore a shell-job we would inherit tty-s, so even if we don't have a right mount for it in container on dump, on restore it should just be right. Else when dumping second time via criu-ns we get: (00.005678) Error (criu/files-reg.c:1710): Can't lookup mount=29 for fd=0 path=/dev/pts/20 Fixes: #1893 Signed-off-by: Pavel Tikhomirov --- criu/files-reg.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index c3761b5ed7..2e3d57c5ef 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -14,6 +14,8 @@ #include #include +#include "tty.h" + #ifndef SEEK_DATA #define SEEK_DATA 3 #define SEEK_HOLE 4 @@ -1689,6 +1691,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) int ret; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; + bool skip_for_shell_job = false; if (!p->link) { if (fill_fdlink(lfd, p, &_link)) @@ -1708,11 +1711,15 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) mi = lookup_mnt_id(p->mnt_id); if (mi == NULL) { - pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); - return -1; + if (opts.shell_job && is_tty(p->stat.st_rdev, p->stat.st_dev)) { + skip_for_shell_job = true; + } else { + pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); + return -1; + } } - if (mnt_is_overmounted(mi)) { + if (!skip_for_shell_job && mnt_is_overmounted(mi)) { pr_err("Open files on overmounted 
mounts are not supported yet\n"); return -1; } @@ -1732,7 +1739,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) return -1; } - if (check_path_remap(link, p, lfd, id, mi->nsid)) + if (!skip_for_shell_job && check_path_remap(link, p, lfd, id, mi->nsid)) return -1; rfe.name = &link->name[1]; ext: From c056f99855db587e2563883c690c55d73439039c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 18:09:49 -0700 Subject: [PATCH 074/122] ci/gha/lint: install a recent shellcheck Instead of using shellcheck v0.7.2 from fedora repo, let's install the latest version (v0.8.0). This allows to remove some "shellcheck disable=..." annotations, and (I hope) better checking quality overall. While at it, remove findutils from dnf install as this package is already installed. Signed-off-by: Kir Kolyshkin --- .github/workflows/lint.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index d32403d052..3d42f3dcf0 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,7 +9,18 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils codespell git-clang-format + run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format + + # TODO: remove this and use ShellCheck from repo once F37 with ShellCheck 0.8.0 is out. 
+ - name: install shellcheck + env: + VERSION: v0.8.0 + BASEURL: https://github.com/koalaman/shellcheck/releases/download + SHA256: f4bce23c11c3919c1b20bcb0f206f6b44c44e26f2bc95f8aa708716095fa0651 + run: | + curl -sSfL --retry 5 $BASEURL/$VERSION/shellcheck-$VERSION.linux.x86_64.tar.xz | + tar xfJ - -C /usr/local/bin --strip 1 shellcheck-$VERSION/shellcheck + sha256sum --strict --check - <<<"$SHA256 /usr/local/bin/shellcheck" - uses: actions/checkout@v2 From 01e643a7778138c709f29391f91c16e8f95ba681 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 16:06:18 -0700 Subject: [PATCH 075/122] scripts/ci/apt-install: fix (not ignore) shellcheck warning It is ok to quote $@, as it expands to "$1" "$2" ... Signed-off-by: Kir Kolyshkin --- scripts/ci/apt-install | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/ci/apt-install b/scripts/ci/apt-install index 5a790901aa..45aca13f40 100755 --- a/scripts/ci/apt-install +++ b/scripts/ci/apt-install @@ -15,8 +15,7 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - # shellcheck disable=SC2068 - apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends $@ && break + apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends "$@" && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" From 527a4ce97f6348035fc62faff9ade018f0e83db8 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 16:07:56 -0700 Subject: [PATCH 076/122] scripts/ci/asan.sh: fix, not ignore, shellcheck warning We can use globstar bash feature instead of find in this case. 
Signed-off-by: Kir Kolyshkin --- scripts/ci/asan.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index 8113b9b195..deeeca0b9d 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -1,7 +1,5 @@ #!/bin/bash -# shellcheck disable=2044 - set -x cat /proc/self/mountinfo @@ -13,7 +11,8 @@ chmod 0777 test/zdtm/static ./test/zdtm.py run -a --keep-going -k always --parallel 4 -x zdtm/static/rtc "$@" ret=$? -for i in $(find / -name 'asan.log*'); do +shopt -s globstar nullglob +for i in /**/asan.log*; do echo "$i" echo ======================================== cat "$i" From 06e1cad3a803c2c1fa20987f592e81d18923d0e8 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 16:40:31 -0700 Subject: [PATCH 077/122] Fix, not ignore, shellcheck SC1091 warnings This is easy to fix (but we have to specify -x). Signed-off-by: Kir Kolyshkin --- Makefile | 8 ++++---- scripts/ci/docker-test.sh | 3 ++- test/others/config-file/run.sh | 2 +- test/others/crit/test.sh | 3 ++- test/others/criu-coredump/test.sh | 2 +- test/others/libcriu/run.sh | 2 +- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index f8d44626b5..6bb1497b37 100644 --- a/Makefile +++ b/Makefile @@ -433,10 +433,10 @@ lint: shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install - shellcheck test/others/crit/*.sh - shellcheck test/others/libcriu/*.sh - shellcheck test/others/crit/*.sh test/others/criu-coredump/*.sh - shellcheck test/others/config-file/*.sh + shellcheck -x test/others/crit/*.sh + shellcheck -x test/others/libcriu/*.sh + shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh + shellcheck -x test/others/config-file/*.sh codespell # Do not append \n to pr_perror or fail ! 
git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index ca93ed77c3..eacfe136e6 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -1,6 +1,6 @@ #!/bin/bash -# shellcheck disable=SC1091,SC2015 +# shellcheck disable=SC2015 set -x -e -o pipefail @@ -19,6 +19,7 @@ add-apt-repository \ ./apt-install docker-ce +# shellcheck source=/dev/null . /etc/lsb-release # docker checkpoint and restore is an experimental feature diff --git a/test/others/config-file/run.sh b/test/others/config-file/run.sh index 92195883e5..26b835b45e 100755 --- a/test/others/config-file/run.sh +++ b/test/others/config-file/run.sh @@ -11,7 +11,7 @@ set -xbm -#shellcheck disable=SC1091 +# shellcheck source=test/others/env.sh source ../env.sh if [ ! -d /etc/criu ]; then diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 0d38043d7a..7db88e0a90 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -1,8 +1,9 @@ #!/bin/bash -# shellcheck disable=SC1091,SC2002 +# shellcheck disable=SC2002 set -x +# shellcheck source=test/others/env.sh source ../env.sh images_list="" diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index dd774e298b..9b6e564755 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -1,7 +1,7 @@ #!/bin/bash set -x -# shellcheck disable=SC1091 +# shellcheck source=test/others/env.sh source ../env.sh || exit 1 function gen_imgs { diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 77bdfb87eb..f7d363aabe 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -9,7 +9,7 @@ TEST_LOG="${TEST_DIR}/test.log" DUMP_LOG="${TEST_DIR}/dump.log" RESTORE_LOG="${TEST_DIR}/restore.log" -# shellcheck disable=1091 +# shellcheck source=test/others/env.sh source "${MAIN_DIR}/../env.sh" || exit 1 echo "== Clean" From 0fce00fa10f6b81ac5c7c88972b7635e853a6211 Mon Sep 17 
00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 17:07:43 -0700 Subject: [PATCH 078/122] scripts/ci/run-ci-tests: use bash arrays This is a preferred way of fixing SC2086 shellcheck warning. Note that since ZDTM_OPTS is passed as a string (via make or docker), we are converting it to an array using read -a. Remove all "shellcheck disable=SC2086" annotations. Signed-off-by: Kir Kolyshkin --- scripts/ci/run-ci-tests.sh | 57 +++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 3760a65e3d..1b761ea563 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,14 +1,17 @@ #!/bin/bash set -x -e -CI_PKGS="protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev +CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml libperl-dev pkg-config python3-future python3-protobuf - python3-junit.xml" + python3-junit.xml) -X86_64_PKGS="gcc-multilib" +X86_64_PKGS=(gcc-multilib) + +# Convert from string to array. +IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) @@ -46,14 +49,14 @@ ci_prep () { else CC=gcc fi - CI_PKGS="$CI_PKGS $CC" + CI_PKGS+=("$CC") # Do not install x86_64 specific packages on other architectures if [ "$UNAME_M" = "x86_64" ]; then - CI_PKGS="$CI_PKGS $X86_64_PKGS" + CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "$CI_PKGS" + scripts/ci/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" # zdtm uses an unversioned python binary to run the tests. @@ -69,9 +72,8 @@ test_stream() { # restorer and eventually close the page read. However, image-streamer expects the # whole image to be read and the image is not reopened, sent twice. These MAP_HUGETLB # test cases will result in EPIPE error at the moment. 
- STREAM_TEST_EXCLUDE="-x maps09 -x maps10" - # shellcheck disable=SC2086 - ./test/zdtm.py run --stream -p 2 --keep-going -a $STREAM_TEST_EXCLUDE $ZDTM_OPTS + STREAM_TEST_EXCLUDE=(-x maps09 -x maps10) + ./test/zdtm.py run --stream -p 2 --keep-going -a "${STREAM_TEST_EXCLUDE[@]}" "${ZDTM_OPTS[@]}" } print_header() { @@ -160,21 +162,20 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then # for 32-bit tests. A better way would involve launching docker.. # But it would require making zdtm.py aware of docker and launching # tests inside the CT. - INCOMPATIBLE_LIBS="libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev" - IA32_PKGS="" + INCOMPATIBLE_LIBS=(libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev) + IA32_PKGS=() REFUGE=64-refuge mkdir "$REFUGE" - for i in $INCOMPATIBLE_LIBS ; do + for i in "${INCOMPATIBLE_LIBS[@]}" ; do for j in $(dpkg --listfiles "$i" | grep '\.so$') ; do cp "$j" "$REFUGE/" done - IA32_PKGS="$IA32_PKGS $i:i386" + IA32_PKGS+=("$i:i386") done - # shellcheck disable=SC2086 - apt-get remove $INCOMPATIBLE_LIBS + apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "$IA32_PKGS" + scripts/ci/apt-install "${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi @@ -211,15 +212,12 @@ if [ "${STREAM_TEST}" = "1" ]; then exit 0 fi -# shellcheck disable=SC2086 -./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS +./test/zdtm.py run -a -p 2 --keep-going "${ZDTM_OPTS[@]}" if criu/criu check --feature move_mount_set_group; then - # shellcheck disable=SC2086 - ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going $ZDTM_OPTS + ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going "${ZDTM_OPTS[@]}" fi -# shellcheck disable=SC2086 -./test/zdtm.py run -a -p 2 --keep-going --criu-config $ZDTM_OPTS +./test/zdtm.py run -a -p 2 --keep-going --criu-config "${ZDTM_OPTS[@]}" # Newer kernels are blocking access to userfaultfd: # uffd: Set unprivileged_userfaultfd sysctl knob to 1 if kernel 
faults must be handled without obtaining CAP_SYS_PTRACE capability @@ -227,17 +225,14 @@ if [ -e /proc/sys/vm/unprivileged_userfaultfd ]; then echo 1 > /proc/sys/vm/unprivileged_userfaultfd fi -LAZY_EXCLUDE="-x maps04 -x cmdlinenv00 -x maps007" +LAZY_EXCLUDE=(-x maps04 -x cmdlinenv00 -x maps007) LAZY_TESTS='.*(maps0|uffd-events|lazy-thp|futex|fork).*' -LAZY_OPTS="-p 2 -T $LAZY_TESTS $LAZY_EXCLUDE $ZDTM_OPTS" - -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --lazy-pages -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages --tls +LAZY_OPTS=(-p 2 -T "$LAZY_TESTS" "${LAZY_EXCLUDE[@]}" "${ZDTM_OPTS[@]}") + +./test/zdtm.py run "${LAZY_OPTS[@]}" --lazy-pages +./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages +./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages --tls bash -x ./test/jenkins/criu-fault.sh if [ "$UNAME_M" == "x86_64" ]; then From 72d27e9818638b3f8762a713ac2ab840afcb152f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 17:10:27 -0700 Subject: [PATCH 079/122] scripts/ci: rm shellcheck disable annotations Those are no longer needed with shellcheck 0.8.0. 
Signed-off-by: Kir Kolyshkin --- scripts/ci/docker-test.sh | 2 -- scripts/ci/podman-test.sh | 1 - 2 files changed, 3 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index eacfe136e6..beb7da6da6 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -1,7 +1,5 @@ #!/bin/bash -# shellcheck disable=SC2015 - set -x -e -o pipefail ./apt-install \ diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 973d2d722a..e08fdf3bc5 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -27,7 +27,6 @@ rm -rf "${tmp_dir}" podman info -# shellcheck disable=SC2016 podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' sleep 1 From ebe87704316990613dd765e19786e62948ccd80e Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 17:49:50 -0700 Subject: [PATCH 080/122] scripts/protobuf-gen.sh: fix (not ignore) shellcheck warnings This basically replaces for x in $(sed ...); do with sed ... | while IFS= read -r x; do The only caveat is, sed program was amended to remove empty lines (there was one right above the PB_AUTOGEN_STOP). 
Signed-off-by: Kir Kolyshkin --- scripts/protobuf-gen.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/protobuf-gen.sh b/scripts/protobuf-gen.sh index 0c738f13a1..25d2feaeb9 100644 --- a/scripts/protobuf-gen.sh +++ b/scripts/protobuf-gen.sh @@ -1,15 +1,15 @@ #!/bin/bash -# shellcheck disable=SC2013,SC1004 - TR="y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/" -for x in $(sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { +sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { /PB_AUTOGEN_ST/d; + /^[ \t]*$/d; s/,.*$//; s/\tPB_//; p; - }' criu/include/protobuf-desc.h); do + }' criu/include/protobuf-desc.h | \ +while IFS= read -r x; do x_la=$(echo "$x" | sed $TR) x_uf=$(echo "$x" | sed -nr 's/^./&#\\\ /; From 6128eb6185b4ca61b2ec4fabe3dc28565e7f643c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 18:03:29 -0700 Subject: [PATCH 081/122] test/others/crit/test.sh: use bash array In fact an array (aptly named array) is already used in run_test2, so let's just make it an array right from the start. While at it, remove ls invocation. 
Signed-off-by: Kir Kolyshkin --- test/others/crit/test.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 7db88e0a90..5d13066e70 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -6,7 +6,7 @@ set -x # shellcheck source=test/others/env.sh source ../env.sh -images_list="" +images_list=() function gen_imgs { PID=$(../loop) @@ -17,15 +17,15 @@ function gen_imgs { exit 1 fi - images_list=$(ls -1 ./*.img) - if [ -z "$images_list" ]; then + images_list=(./*.img) + if [ "${#images_list[@]}" -eq 0 ]; then echo "Failed to generate images" exit 1 fi } function run_test1 { - for x in $images_list + for x in "${images_list[@]}" do echo "=== $x" if [[ $x == *pages* ]]; then @@ -46,9 +46,7 @@ function run_test1 { function run_test2 { - mapfile -t array <<< "$images_list" - - PROTO_IN=${array[0]} + PROTO_IN="${images_list[0]}" JSON_IN=$(mktemp -p ./ tmp.XXXXXXXXXX.json) OUT=$(mktemp -p ./ tmp.XXXXXXXXXX.log) From 58257cb35bb4b2e80b90c3672f626257ecdd34c0 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Wed, 14 Sep 2022 15:42:07 +0800 Subject: [PATCH 082/122] seize: do not overwrite exit code from failpath Signed-off-by: Liu Hua --- criu/seize.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 1333d6db97..f2af12a0bd 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -535,8 +535,10 @@ static int freeze_processes(void) } err: - if (exit_code == 0 || origin_freezer_state == THAWED) - exit_code = freezer_write_state(fd, THAWED); + if (exit_code == 0 || origin_freezer_state == THAWED) { + if (freezer_write_state(fd, THAWED)) + exit_code = -1; + } if (close(fd)) { pr_perror("Unable to thaw tasks"); From 6e9a908af9b515e85895354c4dac312c8da52184 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 2 Sep 2022 07:01:20 -0700 Subject: [PATCH 083/122] compel: Add APIs to facilitate testing Starting the daemon is the first time we run 
code in the victim using the parasite stack. It's useful for testing to be able to infect the victim without starting the daemon so that we can inspect the victim's state, set up stack guards, and so on before stack-related corruption can happen. Add compel_infect_no_daemon() to infect the victim but not start the daemon and compel_start_daemon() to start the daemon after the victim is infected. Add compel_get_stack() to get the victim's main and thread parasite stacks. Signed-off-by: Younes Manton --- compel/include/uapi/infect.h | 5 +++++ compel/src/lib/infect.c | 29 +++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 19d4da2b14..3bd36dda15 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -41,9 +41,12 @@ struct parasite_thread_ctl; extern struct parasite_ctl __must_check *compel_prepare(int pid); extern struct parasite_ctl __must_check *compel_prepare_noctx(int pid); extern int __must_check compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); +extern int __must_check compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, + unsigned long args_size); extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *); +extern int __must_check compel_start_daemon(struct parasite_ctl *ctl); extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); extern int __must_check compel_cure_local(struct parasite_ctl *ctl); @@ -177,4 +180,6 @@ extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); +extern void compel_get_stack(struct parasite_ctl 
*ctl, void **rstack, void **r_thread_stack); + #endif diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 6413a1860b..5aab7aa3ee 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -967,7 +967,7 @@ static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) return ret; } -int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) { int ret; unsigned long p, map_exchange_size, parasite_size = 0; @@ -1079,15 +1079,23 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l goto err; } - if (parasite_start_daemon(ctl)) - goto err; - return 0; err: return -1; } +int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +{ + if (compel_infect_no_daemon(ctl, nr_threads, args_size)) + return -1; + + if (parasite_start_daemon(ctl)) + return -1; + + return 0; +} + struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid) { struct parasite_thread_ctl *tctl; @@ -1427,6 +1435,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return 0; } +int compel_start_daemon(struct parasite_ctl *ctl) +{ + return parasite_start_daemon(ctl); +} + int compel_stop_daemon(struct parasite_ctl *ctl) { if (ctl->daemonized) { @@ -1772,3 +1785,11 @@ void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) { SET_REG_IP(tctl->th.regs, v); } + +void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack) +{ + if (rstack) + *rstack = ctl->rstack; + if (r_thread_stack) + *r_thread_stack = ctl->r_thread_stack; +} From 50dda158f6cd49ed041b128278edd54724beba96 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Tue, 27 Sep 2022 07:10:03 -0700 Subject: [PATCH 084/122] compel: Fix infect test to not override failures Signed-off-by: Younes Manton return zero on chk success 
Signed-off-by: Pavel Tikhomirov Co-authored-by: Pavel Tikhomirov --- compel/test/infect/spy.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index e7273b446a..b10db4d472 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -94,15 +94,15 @@ static inline int chk(int fd, int val) int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) - return 0; + return 1; printf("%d, want %d\n", v, val); - return v == val; + return v != val; } int main(int argc, char **argv) { - int p_in[2], p_out[2], p_err[2], pid, i, pass = 1; + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; /* * Prepare IO-s and fork the victim binary @@ -142,9 +142,11 @@ int main(int argc, char **argv) return 1; printf("Checking the victim alive\n"); - pass = chk(p_out[0], 1); - pass = chk(p_out[0], 42); - if (!pass) + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) return 1; /* @@ -176,14 +178,14 @@ int main(int argc, char **argv) printf("Checking the result\n"); /* These two came from parasite */ - pass = chk(p_out[0], 138); - pass = chk(p_out[0], 403); + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); /* These two came from post-infect */ - pass = chk(p_out[0], 1234); - pass = chk(p_out[0], 4096); + err |= chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); - if (pass) + if (!err) printf("All OK\n"); else printf("Something went WRONG\n"); From a7cbdcb0382cd3d0dea27f8e5bfed3b0fbf30b8b Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Tue, 30 Aug 2022 09:56:42 -0700 Subject: [PATCH 085/122] compel: Add test to check parasite stack setup Some ABIs allow functions to store data in caller frame, which means that we have to allocate an initial stack frame before executing code on the parasite stack. This test saves the contents of writable memory that follows the stack after the victim has been infected but before we start using the parasite stack. 
It later checks that the saved data matches the current contents of the two memory areas. This is done while the victim is halted so we expect a match unless executing parasite code caused memory corruption. The test doesn't detect cases where we corrupted memory by writing the same value. Signed-off-by: Younes Manton --- compel/test/Makefile | 8 +- compel/test/stack/.gitignore | 4 + compel/test/stack/Makefile | 32 +++ compel/test/stack/parasite.c | 38 ++++ compel/test/stack/spy.c | 405 ++++++++++++++++++++++++++++++++++ compel/test/stack/victim.c | 16 ++ test/zdtm/transition/Makefile | 1 + test/zdtm/transition/stack.c | 16 ++ 8 files changed, 518 insertions(+), 2 deletions(-) create mode 100644 compel/test/stack/.gitignore create mode 100644 compel/test/stack/Makefile create mode 100644 compel/test/stack/parasite.c create mode 100644 compel/test/stack/spy.c create mode 100644 compel/test/stack/victim.c create mode 100644 test/zdtm/transition/stack.c diff --git a/compel/test/Makefile b/compel/test/Makefile index 63fb76f80d..f46a821ee8 100644 --- a/compel/test/Makefile +++ b/compel/test/Makefile @@ -1,4 +1,4 @@ -all: fdspy infect rsys +all: fdspy infect rsys stack fdspy: $(Q) $(MAKE) -C fdspy @@ -10,8 +10,12 @@ infect: $(Q) $(MAKE) -C infect run .PHONY: infect - rsys: $(Q) $(MAKE) -C rsys $(Q) $(MAKE) -C rsys run .PHONY: rsys + +stack: + $(Q) $(MAKE) -C stack + $(Q) $(MAKE) -C stack run +.PHONY: stack diff --git a/compel/test/stack/.gitignore b/compel/test/stack/.gitignore new file mode 100644 index 0000000000..0a554758d1 --- /dev/null +++ b/compel/test/stack/.gitignore @@ -0,0 +1,4 @@ +parasite.h +parasite.po +spy +victim diff --git a/compel/test/stack/Makefile b/compel/test/stack/Makefile new file mode 100644 index 0000000000..bacfad9624 --- /dev/null +++ b/compel/test/stack/Makefile @@ -0,0 +1,32 @@ +CC := gcc +CFLAGS ?= -O2 -g -Wall -Werror + +COMPEL := ../../../compel/compel-host + +all: victim spy + +run: + ./spy +.PHONY: run + +clean: + rm -f victim + rm -f 
spy + rm -f parasite.h + rm -f parasite.po + rm -f parasite.o + +victim: victim.c + $(CC) $(CFLAGS) -o $@ $^ + +spy: spy.c parasite.h + $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) + +parasite.h: parasite.po + $(COMPEL) hgen -o $@ -f $< + +parasite.po: parasite.o + ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) + +parasite.o: parasite.c + $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ diff --git a/compel/test/stack/parasite.c b/compel/test/stack/parasite.c new file mode 100644 index 0000000000..ad13bd25de --- /dev/null +++ b/compel/test/stack/parasite.c @@ -0,0 +1,38 @@ +#include + +#include +#include + +/* + * Stubs for std compel plugin. + */ +int parasite_trap_cmd(int cmd, void *args) +{ + return 0; +} +void parasite_cleanup(void) +{ +} + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +int parasite_daemon_cmd(int cmd, void *args) +{ + int v; + + switch (cmd) { + case PARASITE_CMD_INC: + v = (*(int *)args) + 1; + break; + case PARASITE_CMD_DEC: + v = (*(int *)args) - 1; + break; + default: + v = -1; + break; + } + + sys_write(1, &v, sizeof(int)); + return 0; +} diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c new file mode 100644 index 0000000000..9b7c9a7f09 --- /dev/null +++ b/compel/test/stack/spy.c @@ -0,0 +1,405 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "parasite.h" + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +#define err_and_ret(msg) \ + do { \ + fprintf(stderr, msg); \ + return -1; \ + } while (0) + +void *saved_data = NULL; + +#define SAVED_DATA_MAX page_size() + +void cleanup_saved_data(void) +{ + free(saved_data); +} + +static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) +{ + printf("\tLC%u: ", lvl); + vprintf(fmt, parms); +} + +static void *get_parasite_rstack_start(struct 
parasite_ctl *ctl) +{ + void *rstack, *r_thread_stack, *rstack_start; + + compel_get_stack(ctl, &rstack, &r_thread_stack); + + rstack_start = rstack; + if (r_thread_stack != NULL && r_thread_stack < rstack_start) + rstack_start = r_thread_stack; + + return rstack_start; +} + +static int page_writable(struct parasite_ctl *ctl, int pid, void *page) +{ + FILE *maps; + size_t maps_line_len = 0; + char *maps_line = NULL; + char victim_maps_path[6 + 11 + 5 + 1]; + int written; + int ret = 0; + + if (((uintptr_t)page & (page_size() - 1)) != 0) { + fprintf(stderr, "Page address not aligned\n"); + ret = -1; + goto done; + } + + written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); + if (written < 0 || written >= sizeof(victim_maps_path)) { + fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); + ret = -1; + goto done; + } + + maps = fopen(victim_maps_path, "r"); + if (maps == NULL) { + perror("Can't open victim's /proc/$pid/maps"); + ret = -1; + goto done; + } + + while (getline(&maps_line, &maps_line_len, maps) != -1) { + unsigned long vmstart, vmend; + char r, w; + + if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { + fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); + ret = -1; + goto free_linebuf; + } + + if (page >= (void *)vmstart && page < (void *)vmend) { + if (w == 'w') { + if (r != 'r') { + fprintf(stderr, "Expecting writable memory to also be readable"); + ret = -1; + goto free_linebuf; + } + ret = 1; + } + break; + } + } + + if (errno) { + perror("Can't read victim's /proc/$pid/maps"); + ret = -1; + } + +free_linebuf: + free(maps_line); + fclose(maps); +done: + return ret; +} + +static void *read_proc_mem(int pid, void *offset, size_t len) +{ + char victim_mem_path[6 + 11 + 4 + 1]; + int written; + int fd; + void *data; + ssize_t mem_read; + + written = snprintf(victim_mem_path, sizeof(victim_mem_path), "/proc/%d/mem", pid); + if (written < 0 
|| written >= sizeof(victim_mem_path)) { + fprintf(stderr, "Failed to create path string to victim's /proc/%d/mem file\n", pid); + return NULL; + } + + fd = open(victim_mem_path, O_RDONLY); + if (fd < 0) { + perror("Failed to open victim's /proc/$pid/mem file"); + return NULL; + } + + data = malloc(len); + if (data == NULL) { + perror("Can't allocate memory to read victim's /proc/$pid/mem file"); + return NULL; + } + + mem_read = pread(fd, data, len, (off_t)offset); + if (mem_read == -1) { + perror("Failed to read victim's /proc/$pid/mem file"); + goto freebuf; + } + + return data; + +freebuf: + free(data); + return NULL; +} + +static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, + size_t *saved_data_size) +{ + size_t page_mask = page_size() - 1; + size_t saved_size = 0; + size_t stack_size_last_page = (uintptr_t)stack & page_mask; + void *next_page = stack; + + if (stack_size_last_page != 0) { + size_t empty_space_last_page = page_size() - stack_size_last_page; + saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); + next_page += page_size() - stack_size_last_page; + } + + while (saved_size < SAVED_DATA_MAX && next_page != NULL) { + switch (page_writable(ctl, pid, next_page)) { + case 1: + saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); + next_page += page_size(); + break; + case 0: + next_page = NULL; + break; + default: + return -1; + } + } + + if (saved_size > 0) { + void *sd; + + sd = read_proc_mem(pid, stack, saved_size); + if (sd == NULL) + return -1; + + *saved_data = sd; + } else { + *saved_data = NULL; + } + + *saved_data_size = saved_size; + + return 0; +} + +static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) +{ + if (saved_data != NULL) { + void *current_data; + + current_data = read_proc_mem(pid, stack, saved_data_size); + if (current_data == NULL) + return -1; + + if (memcmp(saved_data, 
current_data, saved_data_size) != 0) + return 1; + } + + return 0; +} + +static int do_infection(int pid) +{ + int state; + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + int *arg; + void *stack; + size_t saved_data_size; + int saved_data_check; + + compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); + + printf("Stopping task\n"); + state = compel_stop_task(pid); + if (state < 0) + err_and_ret("Can't stop task\n"); + + printf("Preparing parasite ctl\n"); + ctl = compel_prepare(pid); + if (!ctl) + err_and_ret("Can't prepare for infection\n"); + + printf("Configuring contexts\n"); + + /* + * First -- the infection context. Most of the stuff + * is already filled by compel_prepare(), just set the + * log descriptor for parasite side, library cannot + * live w/o it. + */ + ictx = compel_infect_ctx(ctl); + ictx->log_fd = STDERR_FILENO; + + parasite_setup_c_header(ctl); + + printf("Infecting\n"); + if (compel_infect_no_daemon(ctl, 1, sizeof(int))) + err_and_ret("Can't infect victim\n"); + + if (atexit(cleanup_saved_data)) + err_and_ret("Can't register cleanup function with atexit\n"); + + stack = get_parasite_rstack_start(ctl); + if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) + err_and_ret("Can't save data above stack\n"); + + if (compel_start_daemon(ctl)) + err_and_ret("Can't start daemon in victim\n"); + + /* + * Now get the area with arguments and run two + * commands one by one. 
+ */ + arg = compel_parasite_args(ctl, int); + + printf("Running cmd 1\n"); + *arg = 137; + if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) + err_and_ret("Can't run parasite command 1\n"); + + printf("Running cmd 2\n"); + *arg = 404; + if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) + err_and_ret("Can't run parasite command 2\n"); + + saved_data_check = check_saved_data(ctl, pid, stack, saved_data, saved_data_size); + if (saved_data_check == -1) + err_and_ret("Could not check saved data\n"); + if (saved_data_check != 0) + err_and_ret("Saved data unexpectedly modified\n"); + + /* + * Done. Cure and resume the task. + */ + printf("Curing\n"); + if (compel_cure(ctl)) + err_and_ret("Can't cure victim\n"); + + if (compel_resume_task(pid, state, state)) + err_and_ret("Can't unseize task\n"); + + printf("Done\n"); + + return 0; +} + +static inline int chk(int fd, int val) +{ + int v = 0; + + if (read(fd, &v, sizeof(v)) != sizeof(v)) + return 1; + + printf("%d, want %d\n", v, val); + return v != val; +} + +int main(int argc, char **argv) +{ + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; + + /* + * Prepare IO-s and fork the victim binary + */ + if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { + perror("Can't make pipe"); + return -1; + } + + pid = vfork(); + if (pid == 0) { + close(p_in[1]); + dup2(p_in[0], 0); + close(p_in[0]); + close(p_out[0]); + dup2(p_out[1], 1); + close(p_out[1]); + close(p_err[0]); + dup2(p_err[1], 2); + close(p_err[1]); + execl("./victim", "victim", NULL); + exit(1); + } + + close(p_in[0]); + close(p_out[1]); + close(p_err[1]); + + /* + * Tell the little guy some numbers + */ + i = 1; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 42; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + printf("Checking the victim alive\n"); + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) + return 1; + + /* + * Now do the infection with parasite.c + */ + + printf("Infecting the 
victim\n"); + if (do_infection(pid)) + return 1; + + /* + * Tell the victim some more stuff to check it's alive + */ + i = 1234; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 4096; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + /* + * Stop the victim and check the infection went well + */ + printf("Closing victim stdin\n"); + close(p_in[1]); + printf("Waiting for victim to die\n"); + wait(NULL); + + printf("Checking the result\n"); + + /* These two came from parasite */ + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); + + /* These two came from post-infect */ + err |= chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); + + if (!err) + printf("All OK\n"); + else + printf("Something went WRONG\n"); + + return 0; +} diff --git a/compel/test/stack/victim.c b/compel/test/stack/victim.c new file mode 100644 index 0000000000..f94613fa15 --- /dev/null +++ b/compel/test/stack/victim.c @@ -0,0 +1,16 @@ +#include + +int main(int argc, char **argv) +{ + int i; + + while (1) { + if (read(0, &i, sizeof(i)) != sizeof(i)) + break; + + if (write(1, &i, sizeof(i)) != sizeof(i)) + break; + } + + return 0; +} diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile index 98440f4e2c..ab735bdd4e 100644 --- a/test/zdtm/transition/Makefile +++ b/test/zdtm/transition/Makefile @@ -25,6 +25,7 @@ TST_NOFILE = \ pidfd_store_sk \ rseq01 \ rseq02 \ + stack \ TST_FILE = \ diff --git a/test/zdtm/transition/stack.c b/test/zdtm/transition/stack.c new file mode 100644 index 0000000000..9548b91822 --- /dev/null +++ b/test/zdtm/transition/stack.c @@ -0,0 +1,16 @@ +#include "zdtmtst.h" + +const char *test_doc = "Tests that parasite code does not write past the start of the stack"; +const char *test_author = "Younes Manton "; + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} From 615763ec2d9777167f934f9e6a6830140a94b4ff Mon Sep 17 00:00:00 2001 From: Younes 
Manton Date: Tue, 30 Aug 2022 08:18:21 -0700 Subject: [PATCH 086/122] compel: Fix ppc64le parasite stack layout The ppc64le ABI allows functions to store data in caller frames. When initializing the stack pointer prior to executing parasite code we need to pre-allocate the minimum-sized stack frame before jumping to the parasite code. Signed-off-by: Younes Manton --- compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h | 5 +++++ compel/arch/ppc64/src/lib/infect.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index eb12c9f7cd..8cc94ba740 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -23,6 +23,11 @@ /* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ #define USER_REDZONE_SIZE 512 +#if _CALL_ELF != 2 +#error Only supporting ABIv2. +#else +#define STACK_FRAME_MIN_SIZE 32 +#endif /* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ #define TRAMP_SIZE 6 diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 61cd6e9857..db999ce37f 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -441,13 +441,13 @@ void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { /* - * OpenPOWER ABI requires that r12 is set to the calling function addressi + * OpenPOWER ABI requires that r12 is set to the calling function address * to compute the TOC pointer. 
*/ regs->gpr[12] = new_ip; regs->nip = new_ip; if (stack) - regs->gpr[1] = (unsigned long)stack; + regs->gpr[1] = (unsigned long)stack - STACK_FRAME_MIN_SIZE; regs->trap = 0; } From 4cd295b9bbe850dd7e07f8160bd559e1a4f2e620 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 1 Oct 2022 22:19:24 +0100 Subject: [PATCH 087/122] ci: enable EPEL for CentOS 7 python2-future, python2-junit_xml, python-flake8 and libbsd-devel are now provided from EPEL. Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 03ed797480..c7ed5027a3 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -111,6 +111,8 @@ task: memory: 8G setup_script: | + # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel # Even with selinux in permissive mode the selinux tests will be executed From 294aedcc417f77850df12a7a99198335b851fe29 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jul 2020 16:03:45 +0000 Subject: [PATCH 088/122] non-root: add infrastructure to run as non-root The idea behind the rootless CRIU code is, that CRIU reads out its effective capabilities and stores that in the global opts structure. Different parts of CRIU can then, based on the existing capabilities, automatically enable or disable certain code paths. Currently at least CAP_CHECKPOINT_RESTORE is required. CRIU will not start without this capability. 
Signed-off-by: Adrian Reber --- criu/config.c | 3 +++ criu/cr-restore.c | 4 ++++ criu/include/cr_options.h | 17 ++++++++++++++++- criu/include/restorer.h | 3 +++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/criu/config.c b/criu/config.c index 24c445c8bd..c078848ec2 100644 --- a/criu/config.c +++ b/criu/config.c @@ -705,6 +705,9 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, #undef BOOL_OPT + if (argv && argv[0]) + SET_CHAR_OPTS(argv_0, argv[0]); + ret = pre_parse(argc, argv, usage_error, &no_default_config, &cfg_file); if (ret) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9c480be789..cd8705822c 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3748,6 +3748,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns prep_libc_rseq_info(&task_args->libc_rseq); + task_args->uid = opts.uid; + for (i = 0; i < CR_CAP_SIZE; i++) + task_args->cap_eff[i] = opts.cap_eff[i]; + /* * Fill up per-thread data. */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index e544a2d9a1..6e85dff0a9 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -1,10 +1,11 @@ #ifndef __CR_OPTIONS_H__ #define __CR_OPTIONS_H__ -#include #include #include "common/config.h" #include "common/list.h" +#include "int.h" +#include "image.h" /* Configuration and CLI parsing order defines */ #define PARSING_GLOBAL_CONF 1 @@ -210,6 +211,20 @@ struct cr_options { enum criu_mode mode; int mntns_compat_mode; + + /* Remember the program name passed to main() so we can use it in + * error messages elsewhere. + */ + char *argv_0; + /* + * This contains the eUID of the current CRIU user. It + * will only be set to a non-zero value if CRIU has + * the necessary capabilities to run as non root. 
+ * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN + */ + uid_t uid; + /* This contains the value from /proc/pid/status: CapEff */ + u32 cap_eff[CR_CAP_SIZE]; }; extern struct cr_options opts; diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 325804e449..d642765e3f 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -235,6 +235,9 @@ struct task_restore_args { * unregister it before memory restoration procedure */ struct rst_rseq_param libc_rseq; + + uid_t uid; + u32 cap_eff[CR_CAP_SIZE]; } __aligned(64); /* From de70d2c9c10daac00d8e7c0f20da33eb31c48993 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jul 2020 16:08:07 +0000 Subject: [PATCH 089/122] non-root: add functions to work with capabilities This adds the function check_caps() which checks if CRIU is running with at least CAP_CHECKPOINT_RESTORE. That is the minimum capability CRIU needs to do a minimal checkpoint and restore from it. In addition, helper functions are added to easily query for other capabilities for enhanced checkpoint/restore support. 
Co-authored-by: Younes Manton Signed-off-by: Adrian Reber Signed-off-by: Younes Manton --- criu/cr-check.c | 46 ++++++++++++++++++++++++++++++- criu/include/crtools.h | 1 + criu/include/util-caps.h | 58 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 criu/include/util-caps.h diff --git a/criu/cr-check.c b/criu/cr-check.c index 6c95ffb254..b90e6a9bfd 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -29,7 +29,7 @@ #include "sockets.h" #include "crtools.h" #include "log.h" -#include "util-pie.h" +#include "util-caps.h" #include "prctl.h" #include "files.h" #include "sk-inet.h" @@ -1655,3 +1655,47 @@ static char *feature_name(int (*func)(void)) } return NULL; } + +static int pr_set_dumpable(int value) +{ + int ret = prctl(PR_SET_DUMPABLE, value, 0, 0, 0); + if (ret < 0) + pr_perror("Unable to set PR_SET_DUMPABLE"); + return ret; +} + +int check_caps(void) +{ + struct proc_status_creds creds; + int exit_code = -1; + + if (parse_pid_status(PROC_SELF, &creds.s, NULL)) + goto out; + + memcpy(&opts.cap_eff, &creds.cap_eff, sizeof(u32) * PROC_CAP_SIZE); + + if (!has_cap_checkpoint_restore(opts.cap_eff)) + goto out; + + /* For some things we need to know if we are running as root. */ + opts.uid = geteuid(); + + if (opts.uid) { + /* + * At his point we know we are running as non-root with the necessary + * capabilities available. Now we have to make the process dumpable + * so that /proc/self is not owned by root. 
+ */ + if (pr_set_dumpable(1)) + return -1; + } + + exit_code = 0; +out: + if (exit_code) { + pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); + pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + } + + return exit_code; +} diff --git a/criu/include/crtools.h b/criu/include/crtools.h index b9309654f9..b54b9d9294 100644 --- a/criu/include/crtools.h +++ b/criu/include/crtools.h @@ -26,6 +26,7 @@ extern int cr_pre_dump_tasks(pid_t pid); extern int cr_restore_tasks(void); extern int convert_to_elf(char *elf_path, int fd_core); extern int cr_check(void); +extern int check_caps(void); extern int cr_dedup(void); extern int cr_lazy_pages(bool daemon); diff --git a/criu/include/util-caps.h b/criu/include/util-caps.h new file mode 100644 index 0000000000..7ccd162f5e --- /dev/null +++ b/criu/include/util-caps.h @@ -0,0 +1,58 @@ +#ifndef __CR_UTIL_CAPS_H__ +#define __CR_UTIL_CAPS_H__ + +#include + +#ifndef CAP_CHECKPOINT_RESTORE +#define CAP_CHECKPOINT_RESTORE 40 +#endif + +static inline bool has_capability(int cap, u32 *cap_eff) +{ + int mask = CAP_TO_MASK(cap); + int index = CAP_TO_INDEX(cap); + u32 effective; + + effective = cap_eff[index]; + + if (!(mask & effective)) { + pr_debug("Effective capability %d missing\n", cap); + return false; + } + + return true; +} + +static inline bool has_cap_checkpoint_restore(u32 *cap_eff) +{ + /* + * Everything guarded by CAP_CHECKPOINT_RESTORE is also + * guarded by CAP_SYS_ADMIN. Check for both capabilities. 
+ */ + if (has_capability(CAP_CHECKPOINT_RESTORE, cap_eff) || has_capability(CAP_SYS_ADMIN, cap_eff)) + return true; + + return false; +} + +static inline bool has_cap_net_admin(u32 *cap_eff) +{ + return has_capability(CAP_NET_ADMIN, cap_eff); +} + +static inline bool has_cap_sys_chroot(u32 *cap_eff) +{ + return has_capability(CAP_SYS_CHROOT, cap_eff); +} + +static inline bool has_cap_setuid(u32 *cap_eff) +{ + return has_capability(CAP_SETUID, cap_eff); +} + +static inline bool has_cap_sys_resource(u32 *cap_eff) +{ + return has_capability(CAP_SYS_RESOURCE, cap_eff); +} + +#endif /* __CR_UTIL_CAPS_H__ */ From 3b5f5c7d485964500215ba74b874195b60df85fd Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 12 Aug 2022 11:56:53 -0700 Subject: [PATCH 090/122] non-root: enable non-root checkpoint/restore This commit enables checkpointing and restoring of applications as non-root. The first goal was to enable checkpoint and restore of the env00 and pthread00 test cases. This uses the information from opts.unprivileged and opts.cap_eff to skip certain code paths which do not work as non-root. 
Co-authored-by: Adrian Reber Signed-off-by: Younes Manton --- criu/cgroup.c | 6 ++++ criu/config.c | 1 + criu/cr-check.c | 71 ++++++++++++++++++++++++--------------- criu/cr-restore.c | 3 ++ criu/cr-service.c | 7 ++++ criu/crtools.c | 5 +++ criu/fdstore.c | 16 +++++++-- criu/files.c | 46 +++++++++++++++++++++---- criu/image.c | 3 +- criu/include/cr_options.h | 11 ++++-- criu/include/util.h | 2 ++ criu/namespaces.c | 11 +++--- criu/pie/restorer.c | 26 ++++++++------ criu/timens.c | 4 +++ criu/util.c | 22 ++++++++++++ images/rpc.proto | 1 + lib/c/criu.c | 11 ++++++ lib/c/criu.h | 1 + 18 files changed, 194 insertions(+), 53 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 325df6a1db..d886ce9f26 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -734,6 +734,9 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ unsigned int n_ctls = 0; struct cg_set *cs; + if (opts.unprivileged) + return 0; + if (item) pid = item->pid->real; else @@ -989,6 +992,9 @@ int dump_cgroups(void) CgroupEntry cg = CGROUP_ENTRY__INIT; int ret = -1; + if (opts.unprivileged) + return 0; + BUG_ON(!criu_cgset || !root_cgset); /* diff --git a/criu/config.c b/criu/config.c index c078848ec2..9ba79c8ef3 100644 --- a/criu/config.c +++ b/criu/config.c @@ -700,6 +700,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), + BOOL_OPT("unprivileged", &opts.unprivileged), {}, }; diff --git a/criu/cr-check.c b/criu/cr-check.c index b90e6a9bfd..b54c79387d 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "../soccr/soccr.h" @@ -515,6 +516,14 @@ static int check_ipc(void) { int ret; + /* + * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however + * for non-root users access() runs with an empty 
set of caps and will therefore always + * fail. + */ + if (opts.uid) + return 0; + ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; @@ -1039,10 +1048,14 @@ static int check_tcp(void) } val = 1; - ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); - if (ret < 0) { - pr_perror("Can't turn TCP repair mode ON"); - goto out; + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); + if (ret < 0) { + pr_perror("Can't turn TCP repair mode ON"); + goto out; + } + } else { + pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n"); } optlen = sizeof(val); @@ -1394,9 +1407,6 @@ int cr_check(void) struct ns_id *ns; int ret = 0; - if (!is_root_user()) - return -1; - root_item = alloc_pstree_item(); if (root_item == NULL) return -1; @@ -1666,36 +1676,43 @@ static int pr_set_dumpable(int value) int check_caps(void) { - struct proc_status_creds creds; - int exit_code = -1; - - if (parse_pid_status(PROC_SELF, &creds.s, NULL)) + /* Read out effective capabilities and store in opts.cap_eff. */ + if (set_opts_cap_eff()) goto out; - memcpy(&opts.cap_eff, &creds.cap_eff, sizeof(u32) * PROC_CAP_SIZE); - + /* + * No matter if running as root or not. CRIU always needs + * at least these capabilities. + */ if (!has_cap_checkpoint_restore(opts.cap_eff)) goto out; /* For some things we need to know if we are running as root. */ opts.uid = geteuid(); - if (opts.uid) { - /* - * At his point we know we are running as non-root with the necessary - * capabilities available. Now we have to make the process dumpable - * so that /proc/self is not owned by root. - */ - if (pr_set_dumpable(1)) - return -1; + if (!opts.uid) { + /* CRIU is running as root. No further checks are necessary. 
*/ + return 0; } - exit_code = 0; -out: - if (exit_code) { - pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); - pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + if (!opts.unprivileged) { + pr_msg("Running as non-root requires '--unprivileged'\n"); + pr_msg("Please consult the documentation for limitations when running as non-root\n"); + return -1; } - return exit_code; + /* + * At his point we know we are running as non-root with the necessary + * capabilities available. Now we have to make the process dumpable + * so that /proc/self is not owned by root. + */ + if (pr_set_dumpable(1)) + return -1; + + return 0; +out: + pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); + pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + + return -1; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index cd8705822c..d7d3d8edb7 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1809,6 +1809,9 @@ static int restore_task_with_children(void *_arg) goto err; } + if (set_opts_cap_eff()) + goto err; + /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; diff --git a/criu/cr-service.c b/criu/cr-service.c index 1d9f0aca3b..73c48f5a6c 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "version.h" #include "crtools.h" @@ -409,6 +410,12 @@ static int setup_opts_from_req(int sk, CriuOpts *req) pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); } + if (req->has_unprivileged) + opts.unprivileged = req->unprivileged; + + if (check_caps()) + return 1; + if (kerndat_init()) return 1; diff --git a/criu/crtools.c b/criu/crtools.c index 8bcbe8e38f..ac05bc8215 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -185,6 +185,9 @@ int main(int argc, char *argv[], char *envp[]) return cr_service_work(atoi(argv[optind + 
1])); } + if (check_caps()) + return 1; + if (opts.imgs_dir == NULL) SET_CHAR_OPTS(imgs_dir, "."); @@ -414,6 +417,8 @@ int main(int argc, char *argv[], char *envp[]) " --network-lock METHOD\n" " network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" + " --unprivileged accept limitations when running as non-root\n" + " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/fdstore.c b/criu/fdstore.c index 6a7f73a598..03afa9f178 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -13,6 +13,8 @@ #include "rst-malloc.h" #include "log.h" #include "util.h" +#include "cr_options.h" +#include "util-caps.h" /* clang-format off */ static struct fdstore_desc { @@ -27,6 +29,8 @@ int fdstore_init(void) uint32_t buf[2] = { INT_MAX / 2, INT_MAX / 2 }; struct sockaddr_un addr; unsigned int addrlen; + int rcv_opt_name; + int snd_opt_name; struct stat st; int sk, ret; @@ -49,8 +53,16 @@ int fdstore_init(void) return -1; } - if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + rcv_opt_name = SO_RCVBUFFORCE; + snd_opt_name = SO_SNDBUFFORCE; + } else { + rcv_opt_name = SO_RCVBUF; + snd_opt_name = SO_SNDBUF; + } + + if (setsockopt(sk, SOL_SOCKET, snd_opt_name, &buf[0], sizeof(buf[0])) < 0 || + setsockopt(sk, SOL_SOCKET, rcv_opt_name, &buf[1], sizeof(buf[1])) < 0) { pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); close(sk); return -1; diff --git a/criu/files.c b/criu/files.c index 8a2250e193..38dc076d20 100644 --- a/criu/files.c +++ b/criu/files.c @@ -21,7 +21,7 @@ #include "image.h" #include "common/list.h" #include "rst-malloc.h" -#include "util-pie.h" +#include "util-caps.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" @@ -1346,10 
+1346,35 @@ static int fchroot(int fd) return chroot("."); } +static int need_chroot(int saved_root) +{ + struct stat saved_root_stat, cur_root_stat; + int psd; + + if (fstat(saved_root, &saved_root_stat) == -1) { + pr_perror("Failed to stat saved root dir"); + return -1; + } + + psd = open_pid_proc(PROC_SELF); + if (psd < 0) { + pr_perror("Failed to open PROC_SELF"); + return -1; + } + + if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { + pr_perror("Failed to stat current root dir"); + return -1; + } + + return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; +} + int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); + bool do_chroot = true; /* * First -- open both descriptors. We will not @@ -1368,15 +1393,24 @@ int restore_fs(struct pstree_item *me) goto out; } + /* + * In unprivileged mode chroot() may fail if we don't have + * sufficient privileges, therefore only do it if the process + * is actually chrooted. + */ + if (opts.unprivileged) + do_chroot = need_chroot(dd_root); + /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. 
*/ - - ret = fchroot(dd_root); - if (ret < 0) { - pr_perror("Can't change root"); - goto out; + if (do_chroot) { + ret = fchroot(dd_root); + if (ret < 0) { + pr_perror("Can't change root"); + goto out; + } } ret = fchdir(dd_cwd); diff --git a/criu/image.c b/criu/image.c index 353de48e8f..3c2127ac6e 100644 --- a/criu/image.c +++ b/criu/image.c @@ -226,7 +226,8 @@ int prepare_inventory(InventoryEntry *he) if (get_task_ids(&crt.i)) return -1; - he->has_root_cg_set = true; + if (!opts.unprivileged) + he->has_root_cg_set = true; if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) return -1; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 6e85dff0a9..eacaa03a67 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -2,6 +2,7 @@ #define __CR_OPTIONS_H__ #include +#include #include "common/config.h" #include "common/list.h" #include "int.h" @@ -223,8 +224,14 @@ struct cr_options { * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN */ uid_t uid; - /* This contains the value from /proc/pid/status: CapEff */ - u32 cap_eff[CR_CAP_SIZE]; + /* This contains the value from capget()->effective */ + u32 cap_eff[_LINUX_CAPABILITY_U32S_3]; + /* + * If CRIU should be running as non-root with the help of + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should + * explicitly request it as it comes with many limitations. 
+ */ + int unprivileged; }; extern struct cr_options opts; diff --git a/criu/include/util.h b/criu/include/util.h index 4e29c079ef..3a0403113e 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -386,6 +386,8 @@ extern int mount_detached_fs(const char *fsname); extern char *get_legacy_iptables_bin(bool ipv6); +extern int set_opts_cap_eff(void); + extern ssize_t read_all(int fd, void *buf, size_t size); extern ssize_t write_all(int fd, const void *buf, size_t size); diff --git a/criu/namespaces.c b/criu/namespaces.c index 7356fe8c2f..286073ff6b 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -28,6 +28,7 @@ #include "cgroup.h" #include "fdstore.h" #include "kerndat.h" +#include "util-caps.h" #include "protobuf.h" #include "util.h" @@ -1623,10 +1624,12 @@ int collect_namespaces(bool for_dump) int prepare_userns_creds(void) { - /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ - if (setuid(0) || setgid(0) || setgroups(0, NULL)) { - pr_perror("Unable to initialize id-s"); - return -1; + if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) { + /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + pr_perror("Unable to initialize id-s"); + return -1; + } } /* diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index f80b68359b..0e98cb3dab 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -184,7 +184,7 @@ static int lsm_set_label(char *label, char *type, int procfd) return 0; } -static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type) +static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type, uid_t uid) { CredsEntry *ce = &args->creds; int b, i, ret; @@ -211,10 +211,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * lose caps bits when changing xids. 
*/ - ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); - if (ret) { - pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); + if (ret) { + pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); + return -1; + } } /* @@ -252,10 +254,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * special state any longer. */ - ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); - if (ret) { - pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); + return -1; + } } /* @@ -634,7 +638,7 @@ long __export_restore_thread(struct thread_restore_args *args) if (restore_seccomp(args)) BUG(); - ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type); + ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type, args->ta->uid); ret = ret || restore_dumpable_flag(&args->ta->mm); ret = ret || restore_pdeath_sig(args); if (ret) @@ -1915,7 +1919,7 @@ long __export_restore_task(struct task_restore_args *args) * turning off TCP repair is CAP_SYS_NED_ADMIN protected, * thus restore* creds _after_ all of the above. 
*/ - ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type); + ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type, args->uid); ret = ret || restore_dumpable_flag(&args->mm); ret = ret || restore_pdeath_sig(args->t); ret = ret || restore_child_subreaper(args->child_subreaper); diff --git a/criu/timens.c b/criu/timens.c index 5803fc3594..66c0c02a42 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -5,6 +5,7 @@ #include "proc_parse.h" #include "namespaces.h" #include "timens.h" +#include "cr_options.h" #include "protobuf.h" #include "images/timens.pb-c.h" @@ -57,6 +58,9 @@ int prepare_timens(int id) struct timespec ts; struct timespec prev_moff = {}, prev_boff = {}; + if (opts.unprivileged) + return 0; + img = open_image(CR_FD_TIMENS, O_RSTR, id); if (!img) return -1; diff --git a/criu/util.c b/criu/util.c index 060ca3bd44..b3b2b6659d 100644 --- a/criu/util.c +++ b/criu/util.c @@ -41,6 +41,7 @@ #include "namespaces.h" #include "criu-log.h" #include "syscall.h" +#include "util-caps.h" #include "clone-noasan.h" #include "cr_options.h" @@ -1426,6 +1427,9 @@ void rlimit_unlimit_nofile(void) { struct rlimit new; + if (opts.unprivileged && !has_cap_sys_resource(opts.cap_eff)) + return; + new.rlim_cur = kdat.sysctl_nr_open; new.rlim_max = kdat.sysctl_nr_open; @@ -2064,3 +2068,21 @@ char *resolve_mountpoint(char *path) xfree(free_path); return mp_path; } + +int set_opts_cap_eff(void) +{ + struct __user_cap_header_struct cap_header; + struct __user_cap_data_struct cap_data[_LINUX_CAPABILITY_U32S_3]; + int i; + + cap_header.version = _LINUX_CAPABILITY_VERSION_3; + cap_header.pid = getpid(); + + if (capget(&cap_header, &cap_data[0])) + return -1; + + for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++) + memcpy(&opts.cap_eff[i], &cap_data[i].effective, sizeof(u32)); + + return 0; +} diff --git a/images/rpc.proto b/images/rpc.proto index 3cf431639c..afd2c7b43f 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -139,6 +139,7 @@ message criu_opts 
{ optional criu_network_lock_method network_lock = 64 [default = IPTABLES]; optional bool mntns_compat_mode = 65; optional bool skip_file_rwx_check = 66; + optional bool unprivileged = 67; /* optional bool check_mounts = 128; */ } diff --git a/lib/c/criu.c b/lib/c/criu.c index 8171f7a126..fc8159999c 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -566,6 +566,17 @@ void criu_set_skip_file_rwx_check(bool skip_file_rwx_check) criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check); } +void criu_local_set_unprivileged(criu_opts *opts, bool unprivileged) +{ + opts->rpc->has_unprivileged = true; + opts->rpc->unprivileged = unprivileged; +} + +void criu_set_unprivileged(bool unprivileged) +{ + criu_local_set_unprivileged(global_opts, unprivileged); +} + void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master) { opts->rpc->has_orphan_pts_master = true; diff --git a/lib/c/criu.h b/lib/c/criu.h index c32a8a6462..28a083d88d 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -79,6 +79,7 @@ void criu_set_weak_sysctls(bool val); void criu_set_evasive_devices(bool evasive_devices); void criu_set_shell_job(bool shell_job); void criu_set_skip_file_rwx_check(bool skip_file_rwx_check); +void criu_set_unprivileged(bool unprivileged); void criu_set_orphan_pts_master(bool orphan_pts_master); void criu_set_file_locks(bool file_locks); void criu_set_track_mem(bool track_mem); From 2cb3da2ff5367e43581b316f50c15e79fc9817c0 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 12 Aug 2022 11:58:01 -0700 Subject: [PATCH 091/122] non-root: Introduce unprivileged mode to kerndat This patch modifies how kerndat is handled in unprivileged mode. Initialization and functionality that can only be done as root is made separate from common code. The kerndat file's location is defined as $XDG_RUNTIME_DIR/criu.kdat in unprivileged mode. Since we expect that directory to be on tmpfs we maintain the same behavior as the root-mode kerndat which lives in /run. 
Co-authored-by: Adrian Reber Signed-off-by: Younes Manton --- criu/kerndat.c | 186 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 142 insertions(+), 44 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 0f7d5fc8fb..a209190eea 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -21,6 +21,7 @@ #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include #endif +#include #include "common/config.h" #include "int.h" @@ -51,6 +52,7 @@ #include "sched.h" #include "memfd.h" #include "mount-v2.h" +#include "util-caps.h" struct kerndat_s kdat = {}; @@ -1075,19 +1077,66 @@ static int kerndat_has_openat2(void) return 0; } -#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" -#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" +#define KERNDAT_CACHE_NAME "criu.kdat" +#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME +/* + * Returns: + * -1 if kdat_file was not written due to error + * 0 if kdat_file was written + * 1 if kdat_file was not written because cache directory undefined in env (non-root mode) + */ +static int get_kerndat_filename(char **kdat_file) +{ + int ret; + + /* + * Running as non-root, even with CAP_CHECKPOINT_RESTORE, does not + * allow to write to KDAT_RUNDIR which usually is only writable by root. + * Let's write criu.kdat file to XDG_RUNTIME_DIR for non-root cases. + * Note that XDG_RUNTIME_DIR is not always defined (e.g. when executing + * via su/sudo). + */ + if (opts.unprivileged) { + const char *cache_dir = getenv("XDG_RUNTIME_DIR"); + if (!cache_dir) { + pr_warn("$XDG_RUNTIME_DIR not set. 
Cannot find location for kerndat file\n"); + return 1; + } + ret = asprintf(kdat_file, "%s/%s", cache_dir, KERNDAT_CACHE_NAME); + } else { + ret = asprintf(kdat_file, "%s", KERNDAT_CACHE_FILE); + } + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return -1; + } + + return 0; +} + +/* + * Returns: + * -1 if error + * 0 if cache was loaded + * 1 if cache does not exist or is stale or cache directory undefined in env (non-root mode) + */ static int kerndat_try_load_cache(void) { + cleanup_free char *kdat_file = NULL; int fd, ret; - fd = open(KERNDAT_CACHE_FILE, O_RDONLY); + ret = get_kerndat_filename(&kdat_file); + if (ret) + return ret; + + fd = open(kdat_file, O_RDONLY); if (fd < 0) { if (ENOENT == errno) - pr_debug("File %s does not exist\n", KERNDAT_CACHE_FILE); + pr_debug("File %s does not exist\n", kdat_file); else - pr_warn("Can't load %s\n", KERNDAT_CACHE_FILE); + pr_warn("Can't load %s\n", kdat_file); return 1; } @@ -1101,12 +1150,12 @@ static int kerndat_try_load_cache(void) close(fd); if (ret != sizeof(kdat) || kdat.magic1 != KDAT_MAGIC || kdat.magic2 != KDAT_MAGIC_2) { - pr_warn("Stale %s file\n", KERNDAT_CACHE_FILE); - unlink(KERNDAT_CACHE_FILE); + pr_warn("Stale %s file\n", kdat_file); + unlink(kdat_file); return 1; } - pr_info("Loaded kdat cache from %s\n", KERNDAT_CACHE_FILE); + pr_info("Loaded kdat cache from %s\n", kdat_file); return 0; } @@ -1114,8 +1163,20 @@ static void kerndat_save_cache(void) { int fd, ret; struct statfs s; + cleanup_free char *kdat_file = NULL; + cleanup_free char *kdat_file_tmp = NULL; + + if (get_kerndat_filename(&kdat_file)) + return; + + ret = asprintf(&kdat_file_tmp, "%s.tmp", kdat_file); + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return; + } - fd = open(KERNDAT_CACHE_FILE_TMP, O_CREAT | O_EXCL | O_WRONLY, 0600); + fd = open(kdat_file_tmp, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd < 0) /* * It can happen that we race with some 
other criu @@ -1124,6 +1185,10 @@ static void kerndat_save_cache(void) */ return; + /* + * If running as root we store the cache file on a tmpfs (/run), + * because the file should be gone after reboot. + */ if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { pr_warn("Can't keep kdat cache on non-tempfs\n"); close(fd); @@ -1137,20 +1202,21 @@ static void kerndat_save_cache(void) */ kdat.magic1 = KDAT_MAGIC; kdat.magic2 = KDAT_MAGIC_2; + ret = write(fd, &kdat, sizeof(kdat)); close(fd); if (ret == sizeof(kdat)) - ret = rename(KERNDAT_CACHE_FILE_TMP, KERNDAT_CACHE_FILE); + ret = rename(kdat_file_tmp, kdat_file); else { ret = -1; errno = EIO; } if (ret < 0) { - pr_perror("Couldn't save %s", KERNDAT_CACHE_FILE); + pr_perror("Couldn't save %s", kdat_file); unl: - unlink(KERNDAT_CACHE_FILE_TMP); + unlink(kdat_file); } } @@ -1158,6 +1224,14 @@ static int kerndat_uffd(void) { int uffd, err = 0; + if (opts.unprivileged) + /* + * If running as non-root uffd_open() fails with + * 'Operation not permitted'. Just ignore uffd for + * non-root for now. 
+ */ + return 0; + kdat.uffd_features = 0; uffd = uffd_open(0, &kdat.uffd_features, &err); @@ -1499,6 +1573,45 @@ int kerndat_try_load_new(void) return 0; } +static int root_only_init(void) +{ + int ret = 0; + + if (opts.unprivileged) + return 0; + + if (!ret && kerndat_loginuid()) { + pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_tun_netns()) { + pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_unix_file()) { + pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_link_nsid()) { + pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_netns()) { + pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_nftables_concat()) { + pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_move_mount_set_group()) { + pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); + ret = -1; + } + + return ret; +} + int kerndat_init(void) { int ret; @@ -1516,7 +1629,16 @@ int kerndat_init(void) memset(&kdat, 0, sizeof(kdat)); preload_socket_modules(); - preload_netfilter_modules(); + if (!opts.unprivileged) + /* + * This uses 'iptables -L' to implicitly load necessary modules. + * If the non nft backed iptables is used it does a + * openat(AT_FDCWD, "/run/xtables.lock", O_RDONLY|O_CREAT, 0600) = -1 EACCES + * which will fail as non-root. There are no capabilities to + * change this. 
The iptables nft backend fails with + * openat(AT_FDCWD, "/proc/net/ip_tables_names", O_RDONLY) = -1 EACCES + */ + preload_netfilter_modules(); if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); @@ -1554,10 +1676,14 @@ int kerndat_init(void) pr_err("get_ipv6 failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_loginuid()) { - pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + if (!ret && kerndat_nsid()) { + pr_err("kerndat_nsid failed when initializing kerndat.\n"); ret = -1; } + + if (!ret && root_only_init()) + ret = -1; + if (!ret && kerndat_iptables_has_xtlocks()) { pr_err("kerndat_iptables_has_xtlocks failed when initializing kerndat.\n"); ret = -1; @@ -1570,22 +1696,6 @@ int kerndat_init(void) pr_err("kerndat_compat_restore failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_tun_netns()) { - pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_socket_unix_file()) { - pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_nsid()) { - pr_err("kerndat_nsid failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_link_nsid()) { - pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_has_memfd_create()) { pr_err("kerndat_has_memfd_create failed when initializing kerndat.\n"); ret = -1; @@ -1616,10 +1726,6 @@ int kerndat_init(void) pr_err("kerndat_vdso_preserves_hint failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_socket_netns()) { - pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_x86_has_ptrace_fpu_xsave_bug()) { pr_err("kerndat_x86_has_ptrace_fpu_xsave_bug failed when initializing kerndat.\n"); ret = -1; @@ -1644,7 +1750,7 @@ int kerndat_init(void) pr_err("has_time_namespace failed when initializing kerndat.\n"); ret = -1; } - 
if (!ret && kerndat_has_newifindex()) { + if (!ret && (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) && kerndat_has_newifindex()) { pr_err("kerndat_has_newifindex failed when initializing kerndat.\n"); ret = -1; } @@ -1658,18 +1764,10 @@ int kerndat_init(void) pr_err("kerndat_has_nspid failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_nftables_concat()) { - pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_sockopt_buf_lock()) { pr_err("kerndat_sockopt_buf_lock failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_move_mount_set_group()) { - pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_has_openat2()) { pr_err("kerndat_has_openat2 failed when initializing kerndat.\n"); ret = -1; From 1db95afd02c754beea33c6071b4070e3c35cf77b Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 3 May 2021 14:14:28 +0000 Subject: [PATCH 092/122] Documentation: add details about --unprivileged This adds the non-root section and information about the parameter --unprivileged to the man page. Co-authored-by: Anna Singleton Signed-off-by: Adrian Reber Signed-off-by: Anna Singleton --- Documentation/criu.txt | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 8d2e91443d..3b68f16a4c 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -155,6 +155,12 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty +*--unprivileged*:: + This option tells *criu* to accept the limitations when running + as non-root. Running as non-root requires *criu* at least to have + *CAP_SYS_ADMIN* or *CAP_CHECKPOINT_RESTORE*. For details about running + *criu* as non-root please consult the *NON-ROOT* section. + *-V*, *--version*:: Print program version and exit. 
@@ -877,6 +883,32 @@ configuration file will overwrite all other configuration file settings or RPC options. *This can lead to undesired behavior of criu and should only be used carefully.* +NON-ROOT +-------- +*criu* can be used as non-root with either the *CAP_SYS_ADMIN* capability +or with the *CAP_CHECKPOINT_RESTORE* capability introduced in Linux kernel 5.9. +*CAP_CHECKPOINT_RESTORE* is the minimum that is required. + +*criu* also needs either *CAP_SYS_PTRACE* or a value of 0 in +*/proc/sys/kernel/yama/ptrace_scope* (see *ptrace*(2)) to be able to interrupt +the process for dumping. + +Running *criu* as non-root has many limitations and depending on the process +to checkpoint and restore it may not be possible. + +In addition to *CAP_CHECKPOINT_RESTORE* it is possible to give *criu* additional +capabilities to enable additional features in non-root mode. + +Currently *criu* can benefit from the following additional capabilities: + + - *CAP_NET_ADMIN* + - *CAP_SYS_CHROOT* + - *CAP_SETUID* + - *CAP_SYS_RESOURCE* + +Independent of the capabilities it is always necessary to use "*--unprivileged*" to +accept *criu*'s limitation in non-root mode. + EXAMPLES -------- To checkpoint a program with pid of *1234* and write all image files into From 0add1b6cdbedaaac93d48ea323beceb7328b69b2 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jul 2020 16:18:31 +0000 Subject: [PATCH 093/122] non-root: extend zdtm.py to be able to run tests as non-root These are the minimal changes to make zdtm.py successfully run the env00 and pthread test case as non-root using the '--rootless' zdtm option. 
Co-authored-by: Younes Manton Signed-off-by: Adrian Reber Signed-off-by: Younes Manton --- test/zdtm.py | 56 +++++++++++++++++++++++++++++++++-------- test/zdtm/lib/test.c | 49 +++++++++++++++++++----------------- test/zdtm_ct.c | 60 ++++++++++++++++++++++++-------------------- 3 files changed, 104 insertions(+), 61 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index aefcb36a4f..a311610c3f 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -41,6 +41,8 @@ prev_line = None uuid = uuid.uuid4() +NON_ROOT_UID = 65534 + def alarm(*args): print("==== ALARM ====") @@ -392,10 +394,11 @@ def __init__(self, cr_action): class zdtm_test: - def __init__(self, name, desc, flavor, freezer): + def __init__(self, name, desc, flavor, freezer, rootless): self.__name = name self.__desc = desc self.__freezer = None + self.__rootless = rootless self.__make_action('cleanout') self.__pid = 0 self.__flavor = flavor @@ -439,6 +442,8 @@ def __wait_task_die(self): wait_pid_die(int(self.__pid), self.__name, self.__timeout) def __add_wperms(self): + if os.getuid() != 0: + return # Add write perms for .out and .pid files for b in self._bins: p = os.path.dirname(b) @@ -457,6 +462,9 @@ def start(self): env['ZDTM_NOTIFY_FDIN'] = "100" env['ZDTM_NOTIFY_FDOUT'] = "101" + if self.__rootless: + env['ZDTM_ROOTLESS'] = "1" + if not test_flag(self.__desc, 'suid'): # Numbers should match those in criu env['ZDTM_UID'] = "18943" @@ -618,11 +626,15 @@ def available(): ["make", "zdtm_ct"], env=dict(os.environ, MAKEFLAGS="")) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) + if opts['rootless']: + return subprocess.check_call( ["flock", "zdtm_mount_cgroups.lock", "./zdtm_mount_cgroups", str(uuid)]) @staticmethod def cleanup(): + if opts['rootless']: + return subprocess.check_call( ["flock", "zdtm_mount_cgroups.lock", "./zdtm_umount_cgroups", str(uuid)]) @@ -640,7 +652,9 @@ def load_module_from_file(name, path): class inhfd_test: - def __init__(self, name, 
desc, flavor, freezer): + def __init__(self, name, desc, flavor, freezer, rootless): + if rootless: + raise test_fail_exc("This kind of test does not currently support rootless mode") self.__name = os.path.basename(name) print("Load %s" % name) self.__fdtyp = load_module_from_file(self.__name, name) @@ -801,8 +815,8 @@ def cleanup(): class groups_test(zdtm_test): - def __init__(self, name, desc, flavor, freezer): - zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer) + def __init__(self, name, desc, flavor, freezer, rootless): + zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer, rootless) if flavor.ns: self.__real_name = name with open(name) as fd: @@ -1039,6 +1053,7 @@ def __init__(self, opts): self.__dedup = bool(opts['dedup']) self.__mdedup = bool(opts['noauto_dedup']) self.__user = bool(opts['user']) + self.__rootless = bool(opts['rootless']) self.__leave_stopped = bool(opts['stop']) self.__stream = bool(opts['stream']) self.__show_stats = bool(opts['show_stats']) @@ -1138,6 +1153,9 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): print("Run criu " + action) + if self.__rootless: + s_args += ["--unprivileged"] + strace = [] if self.__sat: fname = os.path.join(self.__ddir(), action + '.strace') @@ -1156,7 +1174,10 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): if action == "restore": preexec = None else: - preexec = self.__user and self.set_user_id or None + if os.getuid(): + preexec = None + else: + preexec = self.__user and self.set_user_id or None __ddir = self.__ddir() @@ -1476,10 +1497,11 @@ def check(feature): except Exception: return False - return criu_cli.run( - "check", - ["--no-default-config", "--verbosity=0", "--feature", feature], - opts['criu_bin']) == 0 + args = ["--no-default-config", "-verbosity=0", "--feature", feature] + if opts['rootless']: + args += ["--unprivileged"] + + return criu_cli.run("check", args, opts['criu_bin']) == 0 @staticmethod def available(): @@ -1900,7 
+1922,7 @@ def do_run_test(tname, tdesc, flavs, opts): if opts['dry_run']: continue flav = flavors[f](opts) - t = tclass(tname, tdesc, flav, fcg) + t = tclass(tname, tdesc, flav, fcg, opts['rootless']) cr_api = criu(opts) try: @@ -2051,7 +2073,8 @@ def run_test(self, name, desc, flavor): 'sat', 'script', 'rpc', 'criu_config', 'lazy_pages', 'join_ns', 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', - 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode') + 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', + 'rootless') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2061,6 +2084,9 @@ def run_test(self, name, desc, flavor): logf = None log = None + if opts['rootless'] and os.getuid() == 0: + os.setgid(NON_ROOT_UID) + os.setuid(NON_ROOT_UID) sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], env=dict(os.environ, CR_CT_TEST_INFO=arg), stdout=log, @@ -2600,6 +2626,10 @@ def set_nr_hugepages(nr): with open("/proc/sys/vm/nr_hugepages", "w") as f: f.write("{}\n".format(nr)) return orig_hugepages + except PermissionError as err: + # EACCES is expected when running as non-root, otherwise re-raise the exception. 
+ if err.errno != errno.EACCES or os.getuid() == 0: + raise except OSError as err: if err.errno != errno.EOPNOTSUPP: raise @@ -2673,6 +2703,10 @@ def get_cli_args(): rp.add_argument("--freezecg", help="Use freeze cgroup (path:state)") rp.add_argument("--user", help="Run CRIU as regular user", action='store_true') + rp.add_argument( + "--rootless", + help="Run CRIU rootless (uid!=0) (needs CAP_CHECKPOINT_RESTORE)", + action='store_true') rp.add_argument("--rpc", help="Run CRIU via RPC rather than CLI", action='store_true') diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 57eb42046a..6291ea4a7b 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -239,34 +239,37 @@ void test_init(int argc, char **argv) exit(1); } - val = getenv("ZDTM_GROUPS"); - if (val) { - char *tok = NULL; - unsigned int size = 0, groups[NGROUPS_MAX]; - - tok = strtok(val, " "); - while (tok) { - size++; - groups[size - 1] = atoi(tok); - tok = strtok(NULL, " "); + val = getenv("ZDTM_ROOTLESS"); + if (!val) { + val = getenv("ZDTM_GROUPS"); + if (val) { + char *tok = NULL; + unsigned int size = 0, groups[NGROUPS_MAX]; + + tok = strtok(val, " "); + while (tok) { + size++; + groups[size - 1] = atoi(tok); + tok = strtok(NULL, " "); + } + + if (setgroups(size, groups)) { + fprintf(stderr, "Can't set groups: %m"); + exit(1); + } } - if (setgroups(size, groups)) { - fprintf(stderr, "Can't set groups: %m"); + val = getenv("ZDTM_GID"); + if (val && (setgid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); exit(1); } - } - val = getenv("ZDTM_GID"); - if (val && (setgid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); - } - - val = getenv("ZDTM_UID"); - if (val && (setuid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); + val = getenv("ZDTM_UID"); + if (val && (setuid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); + exit(1); + } } if (prctl(PR_SET_DUMPABLE, 1)) { diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 
0e8eeff8a3..5e849b904b 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -93,44 +93,50 @@ static int create_timens(void) int main(int argc, char **argv) { + uid_t uid; pid_t pid; int status; + uid = getuid(); + /* * pidns is used to avoid conflicts * mntns is used to mount /proc * net is used to avoid conflicts of parasite sockets */ - if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) - return 1; + if (!uid) + if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) + return 1; pid = fork(); if (pid == 0) { - if (create_timens()) - exit(1); - if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { - fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); - return 1; - } - umount2("/proc", MNT_DETACH); - umount2("/dev/pts", MNT_DETACH); - if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { - fprintf(stderr, "mount(/proc): %m"); - return 1; + if (!uid) { + if (create_timens()) + exit(1); + if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { + fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); + return 1; + } + umount2("/proc", MNT_DETACH); + umount2("/dev/pts", MNT_DETACH); + if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { + fprintf(stderr, "mount(/proc): %m"); + return 1; + } + if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { + fprintf(stderr, "mount(pts): %m"); + return 1; + } + if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { + fprintf(stderr, "mount(binfmt_misc): %m"); + return 1; + } + if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { + fprintf(stderr, "mount(ptmx): %m"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; } - if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { - fprintf(stderr, "mount(pts): %m"); - return 1; - } - if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { - fprintf(stderr, "mount(binfmt_misc): %m"); - return 1; - } - if (mount("/dev/pts/ptmx", 
"/dev/ptmx", NULL, MS_BIND, NULL)) { - fprintf(stderr, "mount(ptmx): %m"); - return 1; - } - if (system("ip link set up dev lo")) - return 1; execv(argv[1], argv + 1); fprintf(stderr, "execve: %m"); return 1; From 8cf8fe884c3ed8d73cfc9ae30722d1b2e4db51cd Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jul 2020 16:21:51 +0000 Subject: [PATCH 094/122] non-root: add non-root test case to cirrus runs Run env00 and pthread00 test as non-root as initial proof of concept. Signed-off-by: Adrian Reber --- .cirrus.yml | 21 +++++++++++++++++++++ scripts/ci/Makefile | 5 ++++- scripts/ci/vagrant.sh | 12 ++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index c7ed5027a3..bad3a12b45 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -68,6 +68,27 @@ task: build_script: | make -C scripts/ci vagrant-fedora-rawhide +task: + name: Vagrant Fedora based test (non-root) + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: cirrus-images + image: family/docker-kvm + platform: linux + cpu: 4 + memory: 16G + nested_virtualization: true + + setup_script: | + scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + sudo kvm-ok + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + build_script: | + make -C scripts/ci vagrant-fedora-non-root + task: name: CentOS Stream 8 based test environment: diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 3a1634fb8b..30dd9ebeb8 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -97,7 +97,10 @@ vagrant-fedora-no-vdso: setup-vagrant vagrant-fedora-rawhide: setup-vagrant ./vagrant.sh fedora-rawhide -.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide +vagrant-fedora-non-root: setup-vagrant + ./vagrant.sh fedora-non-root + +.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root %: $(MAKE) -C 
../build $@$(target-suffix) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index af0f7335ad..e23486f29e 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -68,4 +68,16 @@ fedora-rawhide() { ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } +fedora-non-root() { + ssh default uname -a + ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + # Setting the capability should be the only line needed to run as non-root on Fedora + # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu + ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' + # Run it once as non-root + ssh default 'cd /vagrant/criu; criu/criu check --unprivileged; ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' + # Run it as root with '--rootless' + ssh default 'cd /vagrant/criu; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h; sudo chmod 777 test/dump/zdtm/static/{env00,pthread00}; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' +} + $1 From 18c6426eaeebc5fe7d0f9ca0acb592a3ec828b0c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 20 Oct 2022 18:25:11 +0300 Subject: [PATCH 095/122] cgroup: add a comment to restore_cgroup_prop about path argument requirements In Virtuozzo we've faced out-of-bound access when calling this function on short path string, which corrupted other memory and led to segmentation fault. So it may be useful to have this comment in code to avoid such a misuse of this function in future. 
Signed-off-by: Pavel Tikhomirov --- criu/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/criu/cgroup.c b/criu/cgroup.c index d886ce9f26..6f6117c215 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1303,6 +1303,10 @@ static int restore_perms(int fd, const char *path, CgroupPerms *perms) return 0; } +/* + * Note: The path string can be modified in this function, + * the length of path string should be at least PATH_MAX. + */ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails) { From 5bcde6f5a48b45014a92f7b695e860b7005a585c Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 23 Oct 2022 14:16:21 +0700 Subject: [PATCH 096/122] ipc_sysctl: Prioritize restoring IPC variables using non usernsd approach Since commit https://github.com/torvalds/linux/commit/5563cabdde, user with enough capability can open IPC sysctl files and write to them. Therefore, we don't need to use usernsd process in the outside user namespace to help with that anymore. Furthermore, some later commits: https://github.com/torvalds/linux/commit/1f5c135ee5, https://github.com/torvalds/linux/commit/0889f44e28 bind the IPC namespace to the opened file descriptor of IPC sysctl at the open() time, the changed value does not depend on the IPC namespace of write() time anymore. This breaks the current usernsd approach. So, we prioritize opening/writing IPC sysctl files in the context of restored process directly without usernsd help. This approach succeeds in the newer kernel since the restored process has enough capabilities at this restore stage. With older kernel, the open() fails and we fallback to the usernsd approach. 
Signed-off-by: Bui Quang Minh --- criu/include/sysctl.h | 7 ++++--- criu/ipc_ns.c | 11 ++++++++--- criu/sysctl.c | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index ac7924dcdb..cb3eba8174 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -34,8 +34,9 @@ enum { /* * Some entries might be missing mark them as optional. */ -#define CTL_FLAGS_OPTIONAL 1 -#define CTL_FLAGS_HAS 2 -#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_OPTIONAL 1 +#define CTL_FLAGS_HAS 2 +#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_IPC_EACCES_SKIP 5 #endif /* __CR_SYSCTL_H__ */ diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c index 4fe082fbbc..7e95be8c52 100644 --- a/criu/ipc_ns.c +++ b/criu/ipc_ns.c @@ -292,6 +292,8 @@ static void pr_info_ipc_shm(const IpcShmEntry *shm) static int ipc_sysctl_req(IpcVarEntry *e, int op) { + int i; + struct sysctl_req req[] = { { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, @@ -332,6 +334,9 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op) if (e->has_shm_next_id) req[nr++] = req[16]; + for (i = 0; i < nr; i++) + req[i].flags = CTL_FLAGS_IPC_EACCES_SKIP; + return sysctl_op(req, nr, op, CLONE_NEWIPC); } @@ -570,7 +575,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) { int ret, id; struct sysctl_req req[] = { - { "kernel/sem_next_id", &sem->desc->id, CTL_U32 }, + { "kernel/sem_next_id", &sem->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct semid_ds semid; @@ -703,7 +708,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) { int ret, id; struct sysctl_req req[] = { - { "kernel/msg_next_id", &msq->desc->id, CTL_U32 }, + { "kernel/msg_next_id", &msq->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct msqid_ds msqid; @@ -841,7 +846,7 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) { 
int ret, id, hugetlb_flag = 0; struct sysctl_req req[] = { - { "kernel/shm_next_id", &shm->desc->id, CTL_U32 }, + { "kernel/shm_next_id", &shm->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct shmid_ds shmid; diff --git a/criu/sysctl.c b/criu/sysctl.c index b06688712f..99026acf45 100644 --- a/criu/sysctl.c +++ b/criu/sysctl.c @@ -203,6 +203,17 @@ static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) * 2. forks a task * 3. setns()es to the UTS/IPC namespace of the caller * 4. write()s to the files and exits + * + * For the IPC namespace, since + * https://github.com/torvalds/linux/commit/5563cabdde, user with + * enough capability can open IPC sysctl files and write to it. Later + * commit https://github.com/torvalds/linux/commit/1f5c135ee5 and + * https://github.com/torvalds/linux/commit/0889f44e28 bind the IPC + * namespace at the open() time so the changed value does not depend + * on the IPC namespace at the write() time. Also, the permission check + * changes a little bit which makes the above approach unusable but we + * can simply use nonuserns version for restoring as IPC sysctl as the + * restored process currently has enough capability. 
*/ dir = open("/proc/sys", O_RDONLY, O_DIRECTORY); if (dir < 0) { @@ -335,9 +346,12 @@ static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) return ret; } -static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) +/* exit_code = 1 in case nonuserns failed but we want to fallback to userns approach */ +static int __nonuserns_sysctl_op(struct sysctl_req **orig_req, size_t *orig_nr_req, int op) { int ret, exit_code = -1; + struct sysctl_req *req = *orig_req; + size_t nr_req = *orig_nr_req; while (nr_req--) { int fd; @@ -351,6 +365,14 @@ static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) req++; continue; } + if (errno == EACCES && (req->flags & CTL_FLAGS_IPC_EACCES_SKIP)) { + /* The remaining requests are restored using userns approach */ + *orig_req = req; + *orig_nr_req = nr_req + 1; + exit_code = 1; + goto out; + } + pr_perror("Can't open sysctl %s", req->name); goto out; } @@ -404,7 +426,16 @@ int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns) * so we can do those in process as well. */ if (!ns || ns & CLONE_NEWNET || op == CTL_READ) - return __nonuserns_sysctl_op(req, nr_req, op); + return __nonuserns_sysctl_op(&req, &nr_req, op); + + /* Try to use nonuserns for restoring IPC sysctl and fallback to + * userns approach when the returned code is 1. + */ + if (ns & CLONE_NEWIPC && op == CTL_WRITE) { + ret = __nonuserns_sysctl_op(&req, &nr_req, op); + if (ret <= 0) + return ret; + } /* * In order to avoid lots of opening of /proc/sys for each struct sysctl_req, From 83ed54b5498a012da0082057245a8bb959ec34a7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 25 Oct 2022 17:36:58 +0200 Subject: [PATCH 097/122] Switch aarch64 builds to Cirrus CI It seems like drone.io no longer provides free aarch64/armhf CI runs. This switches the aarch64 CI runs to Cirrus CI. armhf CI runs have been dropped for now as they are not directly supported. 
Signed-off-by: Adrian Reber --- .cirrus.yml | 37 ++++++++++++++++++++++++ .drone.yml | 82 ----------------------------------------------------- 2 files changed, 37 insertions(+), 82 deletions(-) delete mode 100644 .drone.yml diff --git a/.cirrus.yml b/.cirrus.yml index bad3a12b45..dbfb899ffc 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -148,3 +148,40 @@ task: build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" + +task: + name: aarch64 build GCC (native) + arm_container: + image: docker.io/library/ubuntu:jammy + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/apt-install make + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci local + +task: + name: aarch64 build CLANG (native) + arm_container: + image: docker.io/library/ubuntu:jammy + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/apt-install make + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci local CLANG=1 + +task: + name: aarch64 Fedora Rawhide + arm_container: + image: registry.fedoraproject.org/fedora:rawhide + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/prepare-for-fedora-rawhide.sh + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 + make -C test/zdtm -j 4 diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index 07eb8be653..0000000000 --- a/.drone.yml +++ /dev/null @@ -1,82 +0,0 @@ ---- -kind: pipeline -type: docker -name: aarch64 build GCC (native) - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: ubuntu:focal - commands: - - 
scripts/ci/apt-install make - - make -C scripts/ci local - ---- -kind: pipeline -type: docker -name: aarch64 build CLANG (native) - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: ubuntu:focal - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local CLANG=1 - ---- -kind: pipeline -type: docker -name: armhf build GCC (native) - -platform: - os: linux - arch: arm - -steps: -- name: build - # At the time of setting up focal did not work - image: ubuntu:bionic - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local - ---- -kind: pipeline -type: docker -name: armhf build CLANG (native) - -platform: - os: linux - arch: arm - -steps: -- name: build - # At the time of setting up focal did not work - image: ubuntu:bionic - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local CLANG=1 - ---- -kind: pipeline -type: docker -name: aarch64 Fedora Rawhide - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: registry.fedoraproject.org/fedora:rawhide - commands: - - scripts/ci/prepare-for-fedora-rawhide.sh - - make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 - - make -C test/zdtm -j 4 From f5ad26cf7da6e3302af46ecde93d17b4239c31f5 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 14:46:33 +0700 Subject: [PATCH 098/122] cgroup-v2: Checkpoint and restore some global properties This commit supports checkpoint/restore some new global properties in cgroup-v2 cgroup.subtree_control cgroup.max.descendants cgroup.max.depth cgroup.freeze cgroup.type Only cgroup.subtree_control, cgroup.type need some more code to handle. cgroup.subtree_control value needs to be set with "+", "-" prefix and cgroup.type can only be written with value "threaded" if we want to make this controller threaded. cgroup.type is a special property because this property must be restored before any processes can move into this controller. 
Signed-off-by: Bui Quang Minh --- criu/cgroup-props.c | 17 ++++++ criu/cgroup.c | 108 ++++++++++++++++++++++++++++++++---- criu/include/cgroup-props.h | 1 + 3 files changed, 116 insertions(+), 10 deletions(-) diff --git a/criu/cgroup-props.c b/criu/cgroup-props.c index 5bed7dd9d9..1b85c5b5a2 100644 --- a/criu/cgroup-props.c +++ b/criu/cgroup-props.c @@ -35,12 +35,29 @@ static const char *____criu_global_props____[] = { "tasks", }; +/* cgroup2 global properties */ +// clang-format off +static const char *____criu_global_props_v2____[] = { + "cgroup.subtree_control", + "cgroup.max.descendants", + "cgroup.max.depth", + "cgroup.freeze", + "cgroup.type", +}; +// clang-format on + cgp_t cgp_global = { .name = "____criu_global_props____", .nr_props = ARRAY_SIZE(____criu_global_props____), .props = ____criu_global_props____, }; +cgp_t cgp_global_v2 = { + .name = "____criu_global_props_v2____", + .nr_props = ARRAY_SIZE(____criu_global_props_v2____), + .props = ____criu_global_props_v2____, +}; + typedef struct { struct list_head list; cgp_t cgp; diff --git a/criu/cgroup.c b/criu/cgroup.c index 6f6117c215..4f68836bee 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -441,7 +441,15 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru pr_err("dumping known properties failed\n"); return -1; } + } + /* cgroup v2 */ + if (controller->controllers[0][0] == 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global_v2) < 0) { + pr_err("dumping global properties v2 failed\n"); + return -1; + } + } else { if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { pr_err("dumping global properties failed\n"); return -1; @@ -1061,8 +1069,15 @@ static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt * it. We restore these properties as soon as the cgroup is created. 
*/ static const char *special_props[] = { - "cpuset.cpus", "cpuset.mems", "devices.list", "memory.kmem.limit_in_bytes", - "memory.swappiness", "memory.oom_control", "memory.use_hierarchy", NULL, + "cpuset.cpus", + "cpuset.mems", + "devices.list", + "memory.kmem.limit_in_bytes", + "memory.swappiness", + "memory.oom_control", + "memory.use_hierarchy", + "cgroup.type", + NULL, }; bool is_special_property(const char *prop) @@ -1303,6 +1318,65 @@ static int restore_perms(int fd, const char *path, CgroupPerms *perms) return 0; } +static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) +{ + char *current, *next; + size_t len, off = 0; + + current = input; + do { + next = strchrnul(current, ' '); + len = next - current; + + output[off] = prefix; + off++; + memcpy(output + off, current, len); + off += len; + output[off] = ' '; + off++; + + current = next + 1; + } while (*next != '\0'); + + return off; +} + +static int restore_cgroup_subtree_control(const CgroupPropEntry *cg_prop_entry_p, int fd) +{ + char buf[1024]; + char line[1024]; + int ret, off = 0; + + ret = read(fd, buf, sizeof(buf) - 1); + if (ret < 0) { + pr_perror("read from cgroup.subtree_control"); + return ret; + } + /* Remove the trailing newline */ + buf[ret] = '\0'; + + /* Remove all current subsys in subtree_control */ + if (buf[0] != '\0') + off = add_subtree_control_prop_prefix(buf, line, '-'); + + /* Add subsys need to be restored in subtree_control */ + if (cg_prop_entry_p->value[0] != '\0') + off += add_subtree_control_prop_prefix(cg_prop_entry_p->value, line + off, '+'); + + /* Remove the trailing space */ + if (off != 0) { + off--; + line[off] = '\0'; + } + + if (write(fd, line, off) != off) { + pr_perror("write to cgroup.subtree_control"); + return -1; + } + + return 0; +} + /* * Note: The path string can be modified in this function, * the length of path string should be at least PATH_MAX. 
@@ -1310,8 +1384,9 @@ static int restore_perms(int fd, const char *path, CgroupPerms *perms) static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails) { - int cg, fd, ret = -1; + int cg, fd, ret = -1, flag; CgroupPerms *perms = cg_prop_entry_p->perms; + int is_subtree_control = !strcmp(cg_prop_entry_p->name, "cgroup.subtree_control"); if (opts.manage_cgroups == CG_MODE_IGNORE) return 0; @@ -1328,8 +1403,13 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path); + if (is_subtree_control) + flag = O_RDWR; + else + flag = O_WRONLY; + cg = get_service_fd(CGROUP_YARD); - fd = openat(cg, path, O_WRONLY); + fd = openat(cg, path, flag); if (fd < 0) { pr_perror("bad cgroup path: %s", path); return -1; @@ -1344,6 +1424,17 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat goto out; } + if (is_subtree_control) { + ret = restore_cgroup_subtree_control(cg_prop_entry_p, fd); + goto out; + } + + /* skip restoring cgroup.type if its value is not "threaded" */ + if (!strcmp(cg_prop_entry_p->name, "cgroup.type") && strcmp(cg_prop_entry_p->value, "threaded")) { + ret = 0; + goto out; + } + if (split_lines) { char *line = cg_prop_entry_p->value; char *next_line; @@ -1688,12 +1779,9 @@ static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux return -1; for (j = 0; j < n_controllers; j++) { - if (!strcmp(controllers[j], "cpuset") || !strcmp(controllers[j], "memory") || - !strcmp(controllers[j], "devices")) { - if (restore_special_props(paux, off2, e) < 0) { - pr_err("Restoring special cpuset props failed!\n"); - return -1; - } + if (restore_special_props(paux, off2, e) < 0) { + pr_err("Restoring special cpuset props failed!\n"); + return -1; } } } else { diff --git a/criu/include/cgroup-props.h b/criu/include/cgroup-props.h index 
11b6775483..10a7061b80 100644 --- a/criu/include/cgroup-props.h +++ b/criu/include/cgroup-props.h @@ -10,6 +10,7 @@ typedef struct { } cgp_t; extern cgp_t cgp_global; +extern cgp_t cgp_global_v2; extern const cgp_t *cgp_get_props(const char *name); extern bool cgp_should_skip_controller(const char *name); extern bool cgp_add_dump_controller(const char *name); From 1304415e5f159d9c059c48a17f3b7f0468fcb14e Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Tue, 13 Sep 2022 21:42:48 +0700 Subject: [PATCH 099/122] zdtm: Add write_value/read_value helpers into zdtm library Add write_value/read_value helpers to write/read buffer to/from files into zdtm library. Signed-off-by: Bui Quang Minh --- test/zdtm/lib/Makefile | 2 +- test/zdtm/lib/file.c | 46 +++++++++++++++++++++++++++++++++++++ test/zdtm/lib/zdtmtst.h | 3 +++ test/zdtm/static/cgroup04.c | 20 ---------------- 4 files changed, 50 insertions(+), 21 deletions(-) create mode 100644 test/zdtm/lib/file.c diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 3ec58dfaf7..949dc123a7 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -4,7 +4,7 @@ CFLAGS += $(USERCFLAGS) LIB := libzdtmtst.a -LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c +LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c file.c PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') diff --git a/test/zdtm/lib/file.c b/test/zdtm/lib/file.c new file mode 100644 index 0000000000..57d85421d3 --- /dev/null +++ b/test/zdtm/lib/file.c @@ -0,0 +1,46 @@ +#include +#include +#include "zdtmtst.h" + +int write_value(const char *path, const char *value) +{ + int fd, l; + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + l = write(fd, value, strlen(value)); + if (l < 0) { + pr_perror("failed to write %s to %s", value, path); + close(fd); + return -1; + } 
+ + close(fd); + return 0; +} + +int read_value(const char *path, char *value, int size) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + ret = read(fd, (void *)value, size); + if (ret < 0) { + pr_perror("read %s", path); + close(fd); + return -1; + } + + value[ret] = '\0'; + close(fd); + return 0; +} diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index d91886d258..105f3c11a0 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -216,4 +216,7 @@ static inline void cleanup_closep(void *p) TEMP_FAILURE_RETRY(close(*pp)); } +extern int write_value(const char *path, const char *value); +extern int read_value(const char *path, char *value, int size); + #endif /* _VIMITESU_H_ */ diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index 5a424be125..8c40ffd6bd 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -19,26 +19,6 @@ char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; -int write_value(const char *path, const char *value) -{ - int fd, l; - - fd = open(path, O_WRONLY); - if (fd < 0) { - pr_perror("open %s", path); - return -1; - } - - l = write(fd, value, strlen(value)); - close(fd); - if (l < 0) { - pr_perror("failed to write %s to %s", value, path); - return -1; - } - - return 0; -} - int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; From a8328c72a01ac825cebc974f2beb7d9e06cfdebb Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 15:05:10 +0700 Subject: [PATCH 100/122] zdtm: Add test to check global properties of cgroup-v2 are preserved Check that CRIU can checkpoint/restore global properties in cgroup-v2 properly. 
Signed-off-by: Bui Quang Minh --- test/zdtm/static/Makefile | 1 + test/zdtm/static/cgroupv2_00.c | 86 ++++++++++++++++++++++++++ test/zdtm/static/cgroupv2_00.checkskip | 11 ++++ test/zdtm/static/cgroupv2_00.desc | 1 + test/zdtm/static/cgroupv2_00.hook | 16 +++++ 5 files changed, 115 insertions(+) create mode 100644 test/zdtm/static/cgroupv2_00.c create mode 100755 test/zdtm/static/cgroupv2_00.checkskip create mode 100644 test/zdtm/static/cgroupv2_00.desc create mode 100755 test/zdtm/static/cgroupv2_00.hook diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 0ac22731b3..915e565bd0 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -386,6 +386,7 @@ TST_DIR = \ cgroup02 \ cgroup03 \ cgroup04 \ + cgroupv2_00 \ cgroup_ifpriomap \ cgroup_ignore \ cgroup_stray \ diff --git a/test/zdtm/static/cgroupv2_00.c b/test/zdtm/static/cgroupv2_00.c new file mode 100644 index 0000000000..2c6780e0ce --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.c @@ -0,0 +1,86 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that some cgroup-v2 properties in kernel controllers are preserved"; +const char *test_author = "Bui Quang Minh "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); +const char *cgname = "subcg00"; + +int main(int argc, char **argv) +{ + char path[1024], aux[1024]; + int ret = -1; + + test_init(argc, argv); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { + pr_perror("Can't mount cgroup-v2"); + return -1; + } + + sprintf(path, "%s/%s", dirname, cgname); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + /* Make cpuset controllers available in children directory */ + sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, 
"%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); + sprintf(aux, "%d", getpid()); + if (write_value(path, aux)) + goto out; + + test_daemon(); + test_waitsig(); + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + if (read_value(path, aux, sizeof(aux))) + goto out; + + if (strcmp(aux, "cpuset\n")) { + fail("cgroup.subtree_control mismatches"); + goto out; + } + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); + if (read_value(path, aux, sizeof(aux))) + goto out; + + if (strcmp(aux, "threaded\n")) { + fail("cgroup.type mismatches"); + goto out; + } + + pass(); + + ret = 0; + +out: + sprintf(path, "%s", dirname); + umount(path); + return ret; +} diff --git a/test/zdtm/static/cgroupv2_00.checkskip b/test/zdtm/static/cgroupv2_00.checkskip new file mode 100755 index 0000000000..375ed35648 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.checkskip @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 +fi + +if [ -d /sys/fs/cgroup/unified ]; then + grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 +fi + +exit 1 diff --git a/test/zdtm/static/cgroupv2_00.desc b/test/zdtm/static/cgroupv2_00.desc new file mode 100644 index 0000000000..4bfd4b2656 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_00.hook b/test/zdtm/static/cgroupv2_00.hook new file mode 100755 index 0000000000..1002b1ec54 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.hook @@ -0,0 +1,16 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 
+ +set -e +cgname="subcg00" +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup2 cgroup2 $tname + +echo "Cleaning $tname" +echo "-cpuset" > "$tname/$cgname/cgroup.subtree_control" + +set +e +rmdir "$tname/$cgname" +umount "$tname" +rmdir "$tname" From c3a519272881bc36243d142427b8d9f2ddc46f82 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 15:31:13 +0700 Subject: [PATCH 101/122] cgroup-v2: Dump cgroup controllers of every threads in a process Currently, we assume all threads in process are in the same cgroup controllers. However, with threaded controllers, threads in a process may be in different controllers. So we need to dump cgroup controllers of every threads in process and fixup the procfs cgroup parsing to parse from self/task//cgroup. Signed-off-by: Bui Quang Minh --- criu/cgroup.c | 38 ++++++++++++++++++++--------- criu/cr-dump.c | 53 +++++++++++++++++++++++++++++++++++++++-- criu/image.c | 2 +- criu/include/cgroup.h | 8 +++++-- criu/include/parasite.h | 7 +++++- criu/parasite-syscall.c | 1 + criu/pie/parasite.c | 2 +- criu/proc_parse.c | 5 ++-- images/cgroup.proto | 1 + images/core.proto | 1 + 10 files changed, 98 insertions(+), 20 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 4f68836bee..b238b6402b 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -174,6 +174,7 @@ struct cg_controller *new_controller(const char *name) nc->n_controllers = 1; nc->n_heads = 0; + nc->is_threaded = false; INIT_LIST_HEAD(&nc->heads); return nc; @@ -371,7 +372,8 @@ static void free_all_cgroup_props(struct cgroup_dir *ncd) ncd->n_properties = 0; } -static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp) +static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp, + struct cg_controller *controller) { int j; char buf[PATH_MAX]; @@ -422,6 +424,13 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const prop->value = new; } + /* + * Set the is_threaded 
flag if cgroup.type's value is threaded, + * ignore all other values. + */ + if (!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) + controller->is_threaded = true; + pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; @@ -437,7 +446,7 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru for (i = 0; i < controller->n_controllers; ++i) { const cgp_t *cgp = cgp_get_props(controller->controllers[i]); - if (dump_cg_props_array(fpath, ncd, cgp) < 0) { + if (dump_cg_props_array(fpath, ncd, cgp, controller) < 0) { pr_err("dumping known properties failed\n"); return -1; } @@ -445,12 +454,12 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru /* cgroup v2 */ if (controller->controllers[0][0] == 0) { - if (dump_cg_props_array(fpath, ncd, &cgp_global_v2) < 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global_v2, controller) < 0) { pr_err("dumping global properties v2 failed\n"); return -1; } } else { - if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global, controller) < 0) { pr_err("dumping global properties failed\n"); return -1; } @@ -735,9 +744,9 @@ static int collect_cgroups(struct list_head *ctls) return 0; } -int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args) +int dump_thread_cgroup(const struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args, int id) { - int pid; + int pid, tid; LIST_HEAD(ctls); unsigned int n_ctls = 0; struct cg_set *cs; @@ -750,8 +759,13 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ else pid = getpid(); - pr_info("Dumping cgroups for %d\n", pid); - if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) + if (id < 0) + tid = pid; + else + tid = item->threads[id].real; + + pr_info("Dumping cgroups for thread %d\n", tid); + if 
(parse_thread_cgroup(pid, tid, args, &ctls, &n_ctls)) return -1; cs = get_cg_set(&ctls, n_ctls, item); @@ -764,9 +778,10 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ pr_info("Set %d is criu one\n", cs->id); } else { if (item == root_item) { - BUG_ON(root_cgset); - root_cgset = cs; - pr_info("Set %d is root one\n", cs->id); + if (!root_cgset) { + root_cgset = cs; + pr_info("Set %d is root one\n", cs->id); + } } else { struct cg_ctl *root, *stray; @@ -913,6 +928,7 @@ static int dump_controllers(CgroupEntry *cg) list_for_each_entry(cur, &cgroups, l) { cg_controller_entry__init(ce); + ce->is_threaded = cur->is_threaded; ce->cnames = cur->controllers; ce->n_cnames = cur->n_controllers; ce->n_dirs = cur->n_heads; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 210f662323..e31b2f7028 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -759,6 +759,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item pid_t pid = item->pid->real; int ret = -1; struct parasite_dump_cgroup_args cgroup_args, *info = NULL; + u32 *cg_set; BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); @@ -804,13 +805,23 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item */ if (item->ids->has_cgroup_ns_id && !item->parent) { info = &cgroup_args; + strcpy(cgroup_args.thread_cgrp, "self/cgroup"); ret = parasite_dump_cgroup(ctl, &cgroup_args); if (ret) goto err; } - core->tc->has_cg_set = true; - ret = dump_task_cgroup(item, &core->tc->cg_set, info); + /* + * We don't support multithreads zombie tasks so there is + * no thread_core in zombie tasks, store the cg_set in + * task_core in these cases. 
+ */ + cg_set = &core->thread_core->cg_set; + if (item->pid->state == TASK_THREAD) { + core->tc->has_cg_set = true; + cg_set = &core->tc->cg_set; + } + ret = dump_thread_cgroup(item, cg_set, info, -1); if (ret) goto err; @@ -1409,6 +1420,38 @@ static int dump_zombies(void) return ret; } +static int dump_task_cgroup(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) +{ + struct parasite_dump_cgroup_args cgroup_args, *info; + int i; + + BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); + for (i = 0; i < item->nr_threads; i++) { + CoreEntry *core = item->core[i]; + + /* Leader is already dumped */ + if (item->pid->real == item->threads[i].real) + continue; + + /* For now, we only need to dump the root task's cgroup ns, because we + * know all the tasks are in the same cgroup namespace because we don't + * allow nesting. + */ + info = NULL; + if (item->ids->has_cgroup_ns_id && !item->parent) { + info = &cgroup_args; + sprintf(cgroup_args.thread_cgrp, "self/task/%d/cgroup", item->threads[i].ns[0].virt); + if (parasite_dump_cgroup(parasite_ctl, &cgroup_args)) + return -1; + } + + if (dump_thread_cgroup(item, &core->thread_core->cg_set, info, i)) + return -1; + } + + return 0; +} + static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) { pid_t pid = item->pid->real; @@ -1681,6 +1724,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; } + ret = dump_task_cgroup(parasite_ctl, item); + if (ret) { + pr_err("Dump cgroup of threads in process (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + ret = compel_stop_daemon(parasite_ctl); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); diff --git a/criu/image.c b/criu/image.c index 3c2127ac6e..9fb390ab7e 100644 --- a/criu/image.c +++ b/criu/image.c @@ -228,7 +228,7 @@ int prepare_inventory(InventoryEntry *he) if (!opts.unprivileged) he->has_root_cg_set = true; - if (dump_task_cgroup(NULL, 
&he->root_cg_set, NULL)) + if (dump_thread_cgroup(NULL, &he->root_cg_set, NULL, -1)) return -1; he->root_ids = crt.i.ids; diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 2e9b8933ce..5a254559d7 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -7,7 +7,7 @@ struct pstree_item; struct parasite_dump_cgroup_args; extern u32 root_cg_set; -int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args); +int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); int prepare_task_cgroup(struct pstree_item *); int prepare_cgroup(void); @@ -60,6 +60,9 @@ struct cg_controller { /* for cgroup list in cgroup.c */ struct list_head l; + + /* controller is a threaded cgroup or not */ + int is_threaded; }; struct cg_controller *new_controller(const char *name); @@ -87,7 +90,8 @@ struct cg_ctl { */ struct list_head; struct parasite_dump_cgroup_args; -extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); +extern int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *l, + unsigned int *n); extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); diff --git a/criu/include/parasite.h b/criu/include/parasite.h index d2a06889f6..787c927be9 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -241,7 +241,12 @@ struct parasite_dump_cgroup_args { * * The string is null terminated. */ - char contents[1 << 12]; + char contents[(1 << 12) - 32]; + /* + * Contains the path to thread cgroup procfs. 
+ * "self/task//cgroup" + */ + char thread_cgrp[32]; }; #endif /* !__ASSEMBLY__ */ diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index ee4fa86f4f..d3541d9969 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -513,6 +513,7 @@ int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_a struct parasite_dump_cgroup_args *ca; ca = compel_parasite_args(ctl, struct parasite_dump_cgroup_args); + memcpy(ca->thread_cgrp, cgroup->thread_cgrp, sizeof(ca->thread_cgrp)); ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_CGROUP, ctl); if (ret) { pr_err("Parasite failed to dump /proc/self/cgroup\n"); diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index f75fe13bb6..2303f41c39 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -745,7 +745,7 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) return -1; } - cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0); + cgroup = sys_openat(proc, args->thread_cgrp, O_RDONLY, 0); sys_close(proc); if (cgroup < 0) { pr_err("can't get /proc/self/cgroup fd\n"); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 946b0fc40e..abac5908b7 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -2549,7 +2549,8 @@ int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n) return -1; } -int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n) +int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *retl, + unsigned int *n) { FILE *f; int ret; @@ -2557,7 +2558,7 @@ int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct li unsigned int n_internal = 0; struct cg_ctl *intern, *ext; - f = fopen_proc(pid, "cgroup"); + f = fopen_proc(pid, "task/%d/cgroup", tid); if (!f) return -1; diff --git a/images/cgroup.proto b/images/cgroup.proto index ee03541240..5c7d16c6d0 100644 --- a/images/cgroup.proto +++ b/images/cgroup.proto @@ 
-24,6 +24,7 @@ message cgroup_dir_entry { message cg_controller_entry { repeated string cnames = 1; repeated cgroup_dir_entry dirs = 2; + required bool is_threaded = 3; } message cg_member_entry { diff --git a/images/core.proto b/images/core.proto index 345bdca53b..1ee32bfda9 100644 --- a/images/core.proto +++ b/images/core.proto @@ -105,6 +105,7 @@ message thread_core_entry { optional string comm = 13; optional uint64 blk_sigset_extended = 14; optional rseq_entry rseq_entry = 15; + required uint32 cg_set = 16; } message task_rlimits_entry { From da84213352ee4863e1739fd2b4da1e177426f98a Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 15:41:17 +0700 Subject: [PATCH 102/122] cgroup-v2: Restore threads in a process into correct threaded controllers As threads in a process may be in different threaded controllers, we need to move those threads to the correct controllers. Because the threads of a process are restored in a later stage in restorer.c, we need to create a cgroupd service to help move those threads into the correct controllers when they are restored. We cannot use usernsd as the code in restorer does not know the address of an outside function to pass to userns_call. However, this cgroupd service still reuses a lot of code from usernsd. The main logic is that restored threads receive the cg_set number they belong to before the restorer stage if their cg_set differs from the main thread's. When these threads are restored, they send the cg_set number and their thread ids through a unix socket to cgroupd. cgroupd receives the cg_set number and thread ids and moves those threads into the correct controllers. Thread ids are sent through SCM_CREDENTIALS of the unix socket so they are translated into the correct thread ids on the receiving end.
Signed-off-by: Bui Quang Minh --- criu/cgroup.c | 140 +++++++++++++++++++++++++++++++++++++- criu/cr-restore.c | 19 +++++- criu/include/cgroup.h | 2 + criu/include/namespaces.h | 17 +++++ criu/include/restorer.h | 2 + criu/include/servicefd.h | 1 + criu/namespaces.c | 65 +++++++++--------- criu/pie/restorer.c | 107 +++++++++++++++++++++++++++++ 8 files changed, 319 insertions(+), 34 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index b238b6402b..918827d993 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "common/list.h" #include "xmalloc.h" @@ -55,6 +56,7 @@ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; +static pid_t cgroupd_pid; static CgSetEntry *find_rst_set_by_id(u32 id) { @@ -1935,6 +1937,136 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } +/* + * If a thread is a different cgroup set than the main thread in process, + * it means it is in a threaded controller. 
This daemon receives the cg_set + * number from the restored thread and move this thread to the correct + * cgroup controllers + */ +static int cgroupd(int sk) +{ + pr_info("cgroud: Daemon started\n"); + + while (1) { + struct unsc_msg um; + uns_call_t call; + pid_t tid; + int fd, cg_set, i; + CgSetEntry *cg_set_entry; + int ret; + + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL); + ret = recvmsg(sk, &um.h, 0); + if (ret <= 0) { + pr_perror("cgroupd: recv req error"); + return -1; + } + + unsc_msg_pid_fd(&um, &tid, &fd); + pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set); + + cg_set_entry = find_rst_set_by_id(cg_set); + if (!cg_set_entry) { + pr_err("cgroupd: No set found %d\n", cg_set); + return -1; + } + + for (i = 0; i < cg_set_entry->n_ctls; i++) { + int j, aux_off; + CgMemberEntry *ce = cg_set_entry->ctls[i]; + char aux[PATH_MAX]; + CgControllerEntry *ctrl = NULL; + + for (j = 0; j < n_controllers; j++) { + CgControllerEntry *cur = controllers[j]; + if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { + ctrl = cur; + break; + } + } + + if (!ctrl) { + pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path); + return -1; + } + + /* + * This is not a threaded controller, all threads in this + * process must be in this controller. Main thread has been + * restored, so this thread is in this controller already. + */ + if (!ctrl->is_threaded) + continue; + + aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path); + + /* + * Cgroupd runs outside of the namespaces so we don't + * need to use userns_call here + */ + if (userns_move(aux, 0, tid)) { + pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path); + return -1; + } + } + + /* + * We only want to send the cred which contains thread id back. + * The restored thread recvmsg(MSG_PEEK) until it gets its own + * thread id. 
+ */ + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid); + if (sendmsg(sk, &um.h, 0) <= 0) { + pr_perror("cgroupd: send req error"); + return -1; + } + } + + return 0; +} + +int stop_cgroupd(void) +{ + if (cgroupd_pid) { + sigset_t blockmask, oldmask; + + /* + * Block the SIGCHLD signal to avoid triggering + * sigchld_handler() + */ + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + kill(cgroupd_pid, SIGTERM); + waitpid(cgroupd_pid, NULL, 0); + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + } + + return 0; +} + +static int prepare_cgroup_thread_sfd(void) +{ + int sk; + + sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd); + if (sk < 0) { + pr_err("failed to start cgroupd\n"); + return -1; + } + + if (install_service_fd(CGROUPD_SK, sk) < 0) { + kill(cgroupd_pid, SIGKILL); + waitpid(cgroupd_pid, NULL, 0); + return -1; + } + + return 0; +} + static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot) { size_t dirlen = strlen(*dir_name); @@ -2089,15 +2221,19 @@ int prepare_cgroup(void) n_controllers = ce->n_controllers; controllers = ce->controllers; - if (n_sets) + if (n_sets) { /* * We rely on the fact that all sets contain the same * set of controllers. This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. 
*/ ret = prepare_cgroup_sfd(ce); - else + if (ret < 0) + return ret; + ret = prepare_cgroup_thread_sfd(); + } else { ret = 0; + } return ret; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d7d3d8edb7..78f2a9701f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1349,7 +1349,12 @@ static inline int fork_with_pid(struct pstree_item *item) return -1; item->pid->state = ca.core->tc->task_state; - rsti(item)->cg_set = ca.core->tc->cg_set; + + /* Zombie task's cg_set is stored in task_core */ + if (item->pid->state == TASK_DEAD) + rsti(item)->cg_set = ca.core->tc->cg_set; + else + rsti(item)->cg_set = ca.core->thread_core->cg_set; if (ca.core->tc->has_stop_signo) item->pid->stop_signo = ca.core->tc->stop_signo; @@ -2376,6 +2381,10 @@ static int restore_root_task(struct pstree_item *init) if (ret < 0) goto out_kill; + ret = stop_cgroupd(); + if (ret < 0) + goto out_kill; + ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; @@ -3812,6 +3821,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); + if (rsti(current)->cg_set != tcore->thread_core->cg_set) { + thread_args[i].cg_set = tcore->thread_core->cg_set; + thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); + } else { + thread_args[i].cg_set = -1; + } + ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); if (ret) goto err; @@ -3906,6 +3922,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); + close_service_fd(CGROUPD_SK); __gcov_flush(); diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 5a254559d7..93f61539cf 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -96,4 +96,6 @@ extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned 
int *n_cgroups); +int stop_cgroupd(void); + #endif /* __CR_CGROUP_H__ */ diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index e2ea6e17f6..183a3b8526 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -1,6 +1,8 @@ #ifndef __CR_NS_H__ #define __CR_NS_H__ +#include + #include "common/compiler.h" #include "files.h" #include "common/list.h" @@ -224,4 +226,19 @@ extern int add_ns_shared_cb(int (*actor)(void *data), void *data); extern struct ns_id *get_socket_ns(int lfd); extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); +struct unsc_msg { + struct msghdr h; + /* + * 0th is the call address + * 1st is the flags + * 2nd is the optional (NULL in response) arguments + */ + struct iovec iov[3]; + char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; +}; + +extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid); +extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd); +extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)); + #endif /* __CR_NS_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index d642765e3f..bc0beb5cbb 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -121,6 +121,8 @@ struct thread_restore_args { bool seccomp_force_tsync; char comm[TASK_COMM_LEN]; + int cg_set; + int cgroupd_sk; } __aligned(64); typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args); diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index c6979de7f4..4265d94edd 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ -24,6 +24,7 @@ enum sfd_type { */ ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, + CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */ USERNSD_SK, /* Socket for usernsd */ NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to transfer file descriptors */ diff 
--git a/criu/namespaces.c b/criu/namespaces.c index 286073ff6b..0dc19d5b60 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -1218,20 +1217,9 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) return 0; } -struct unsc_msg { - struct msghdr h; - /* - * 0th is the call address - * 1st is the flags - * 2nd is the optional (NULL in response) arguments - */ - struct iovec iov[3]; - char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; -}; - static int usernsd_pid; -static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd) +inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid) { struct cmsghdr *ch; struct ucred *ucred; @@ -1269,7 +1257,10 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void ch->cmsg_type = SCM_CREDENTIALS; ucred = (struct ucred *)CMSG_DATA(ch); - ucred->pid = getpid(); + if (pid) + ucred->pid = *pid; + else + ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); @@ -1284,7 +1275,7 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void } } -static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) +void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; @@ -1322,7 +1313,7 @@ static int usernsd(int sk) int flags, fd, ret; pid_t pid; - unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0); + unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL); if (recvmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: recv req error"); return -1; @@ -1367,7 +1358,7 @@ static int usernsd(int sk) else fd = -1; - unsc_msg_init(&um, &call, &ret, NULL, 0, fd); + unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL); if (sendmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: send resp error"); return -1; @@ -1418,7 
+1409,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Send the request */ - unsc_msg_init(&um, &call, &flags, arg, arg_size, fd); + unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL); ret = sendmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: send req error"); @@ -1433,7 +1424,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Get the response back */ - unsc_msg_init(&um, &call, &res, NULL, 0, 0); + unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL); ret = recvmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: recv resp error"); @@ -1454,14 +1445,11 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, return ret; } -static int start_usernsd(void) +int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) { int sk[2]; int one = 1; - if (!(root_ns_mask & CLONE_NEWUSER)) - return 0; - /* * Seqpacket to * @@ -1490,24 +1478,39 @@ static int start_usernsd(void) return -1; } - usernsd_pid = fork(); - if (usernsd_pid < 0) { - pr_perror("Can't fork usernsd"); + *pid = fork(); + if (*pid < 0) { + pr_perror("Can't unix daemon"); close(sk[0]); close(sk[1]); return -1; } - if (usernsd_pid == 0) { + if (*pid == 0) { int ret; - close(sk[0]); - ret = usernsd(sk[1]); + ret = daemon_func(sk[1]); exit(ret); } - close(sk[1]); - if (install_service_fd(USERNSD_SK, sk[0]) < 0) { + + return sk[0]; +} + +static int start_usernsd(void) +{ + int sk; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + sk = start_unix_cred_daemon(&usernsd_pid, usernsd); + if (sk < 0) { + pr_err("failed to start usernsd\n"); + return -1; + } + + if (install_service_fd(USERNSD_SK, sk) < 0) { kill(usernsd_pid, SIGKILL); waitpid(usernsd_pid, NULL, 0); return -1; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0e98cb3dab..99cff1f7d0 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "linux/userfaultfd.h" 
@@ -586,6 +587,103 @@ static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sig ARCH_RT_SIGRETURN(new_sp, sigframe); } +static int send_cg_set(int sk, int cg_set) +{ + struct cmsghdr *ch; + struct msghdr h; + /* + * 0th is the dummy call address for compatibility with userns helper + * 1st is the cg_set + */ + struct iovec iov[2]; + char cmsg[CMSG_SPACE(sizeof(struct ucred))] = {}; + int ret, *dummy = NULL; + struct ucred *ucred; + + iov[0].iov_base = &dummy; + iov[0].iov_len = sizeof(dummy); + iov[1].iov_base = &cg_set; + iov[1].iov_len = sizeof(cg_set); + + h.msg_iov = iov; + h.msg_iovlen = sizeof(iov) / sizeof(struct iovec); + h.msg_name = NULL; + h.msg_namelen = 0; + h.msg_flags = 0; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_CREDENTIALS; + + ucred = (struct ucred *)CMSG_DATA(ch); + /* + * We still have privilege in this namespace so we can send + * thread id instead of pid of main thread, uid, gid as 0 + * since these 2 are ignored in cgroupd + */ + ucred->pid = sys_gettid(); + ucred->uid = 0; + ucred->gid = 0; + + ret = sys_sendmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to send packet to cgroupd %d\n", ret); + return -1; + } + + return 0; +} + +/* + * As this socket is shared among threads, recvmsg(MSG_PEEK) + * from the socket until getting its own thread id as an + * acknowledge of successful threaded cgroup fixup + */ +static int recv_cg_set_restore_ack(int sk) +{ + struct cmsghdr *ch; + struct msghdr h = {}; + char cmsg[CMSG_SPACE(sizeof(struct ucred))]; + struct ucred *cred; + int ret; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + + while (1) { + ret = sys_recvmsg(sk, &h, MSG_PEEK); + if (ret < 0) { + pr_err("Unable to peek from cgroupd %d\n", ret); + return -1; + } + + if (h.msg_controllen != sizeof(cmsg)) { + pr_err("The message from cgroupd is truncated\n"); + 
return -1; + } + + ch = CMSG_FIRSTHDR(&h); + cred = (struct ucred *)CMSG_DATA(ch); + if (cred->pid != sys_gettid()) + continue; + + /* + * Actual remove message from recv queue of socket + */ + ret = sys_recvmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to receive from cgroupd %d\n", ret); + return -1; + } + + break; + } + return 0; +} + /* * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. @@ -613,6 +711,15 @@ long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; + if (args->cg_set != -1) { + pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); + if (send_cg_set(args->cgroupd_sk, args->cg_set)) + goto core_restore_end; + if (recv_cg_set_restore_ack(args->cgroupd_sk)) + goto core_restore_end; + sys_close(args->cgroupd_sk); + } + if (restore_thread_common(args)) goto core_restore_end; From 030c5ab9738a26904b820991fa0ab2c51ee5c2da Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 15:55:24 +0700 Subject: [PATCH 103/122] zdtm: Check threads are restored into correct threaded controllers This test creates a process with 2 threads in different threaded controllers and checks that CRIU restores these threads' cgroup controllers properly.
Signed-off-by: Bui Quang Minh --- test/zdtm/static/Makefile | 3 + test/zdtm/static/cgroupv2_01.c | 180 +++++++++++++++++++++++++ test/zdtm/static/cgroupv2_01.checkskip | 11 ++ test/zdtm/static/cgroupv2_01.desc | 1 + test/zdtm/static/cgroupv2_01.hook | 24 ++++ 5 files changed, 219 insertions(+) create mode 100644 test/zdtm/static/cgroupv2_01.c create mode 100755 test/zdtm/static/cgroupv2_01.checkskip create mode 100644 test/zdtm/static/cgroupv2_01.desc create mode 100755 test/zdtm/static/cgroupv2_01.hook diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 915e565bd0..edac92c83a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -387,6 +387,7 @@ TST_DIR = \ cgroup03 \ cgroup04 \ cgroupv2_00 \ + cgroupv2_01 \ cgroup_ifpriomap \ cgroup_ignore \ cgroup_stray \ @@ -679,6 +680,8 @@ sk-unix-listen02: CFLAGS += -DSK_UNIX_LISTEN02 sk-unix-listen03: CFLAGS += -DSK_UNIX_LISTEN03 sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 +cgroupv2_01: LDLIBS += -pthread + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/cgroupv2_01.c b/test/zdtm/static/cgroupv2_01.c new file mode 100644 index 0000000000..f3a6d18baf --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.c @@ -0,0 +1,180 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that cgroup-v2 threaded controllers"; +const char *test_author = "Bui Quang Minh "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); +const char *cgname = "subcg01"; + +task_waiter_t t; + +#define gettid(code) syscall(__NR_gettid) + +void cleanup(void) +{ + char path[1024]; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); + rmdir(path); + sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); + rmdir(path); + sprintf(path, "%s/%s", dirname, cgname); + rmdir(path); + sprintf(path, "%s", dirname); + umount(path); +} + +int is_in_cgroup(char *cgname) +{ + FILE *cgf; + char buffer[1024]; + + 
sprintf(buffer, "/proc/self/task/%ld/cgroup", gettid()); + cgf = fopen(buffer, "r"); + if (cgf == NULL) { + pr_err("Fail to open thread's cgroup procfs\n"); + return 0; + } + + while (fgets(buffer, sizeof(buffer), cgf)) { + if (strstr(buffer, cgname)) { + fclose(cgf); + return 1; + } + } + + fclose(cgf); + return 0; +} + +void *thread_func(void *arg) +{ + char path[1024], aux[1024]; + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.threads"); + sprintf(aux, "%ld", gettid()); + if (write_value(path, aux)) { + cleanup(); + exit(1); + } + + read_value(path, aux, sizeof(aux)); + + task_waiter_complete(&t, 1); + + /* Wait for restore */ + task_waiter_wait4(&t, 2); + + sprintf(path, "/%s/%s", cgname, "thread2"); + if (!is_in_cgroup(path)) { + fail("Thread2's cgroup is not restored"); + cleanup(); + exit(1); + } + + return NULL; +} + +int main(int argc, char **argv) +{ + char path[1024], aux[1024]; + pthread_t thread2; + int ret = 1; + + test_init(argc, argv); + task_waiter_init(&t); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { + pr_perror("Can't mount cgroup-v2"); + return -1; + } + + sprintf(path, "%s/%s", dirname, cgname); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + /* Make cpuset controllers available in children directory */ + sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); + sprintf(aux, "%d", getpid()); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + 
goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + ret = pthread_create(&thread2, NULL, thread_func, NULL); + if (ret < 0) { + pr_err("pthread_create %s\n", strerror(ret)); + ret = 1; + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.threads"); + sprintf(aux, "%ld", gettid()); + if (write_value(path, aux)) + goto out; + + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + task_waiter_complete(&t, 2); + + sprintf(path, "/%s/%s", cgname, "thread1"); + if (!is_in_cgroup(path)) { + fail("Main thread's cgroup is not restored"); + cleanup(); + exit(1); + } + pthread_join(thread2, NULL); + pass(); + + ret = 0; + +out: + cleanup(); + return ret; +} diff --git a/test/zdtm/static/cgroupv2_01.checkskip b/test/zdtm/static/cgroupv2_01.checkskip new file mode 100755 index 0000000000..375ed35648 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.checkskip @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 +fi + +if [ -d /sys/fs/cgroup/unified ]; then + grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 +fi + +exit 1 diff --git a/test/zdtm/static/cgroupv2_01.desc b/test/zdtm/static/cgroupv2_01.desc new file mode 100644 index 0000000000..4bfd4b2656 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_01.hook b/test/zdtm/static/cgroupv2_01.hook new file mode 100755 index 
0000000000..2263fd0146 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.hook @@ -0,0 +1,24 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e +cgname="subcg01" +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup2 cgroup2 $tname + +echo "Cleaning $tname" + +set +e +rmdir "$tname/$cgname/thread1" + +# When the test finishes, the cleanup() function removes this directory +# successfully because the thread in this controller exit and no other +# threads belong to this controller +if [ "$1" == "--pre-restore" ]; then + rmdir "$tname/$cgname/thread2" +fi + +rmdir "$tname/$cgname" +umount "$tname" +rmdir "$tname" From d3ed3e90876964334526000f275c6f13cbf2e856 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Tue, 6 Sep 2022 22:04:08 +0700 Subject: [PATCH 104/122] ci: Make cpuset move to cgroup-v2 hierarchy As cgroupv2_00 and cgroupv2_01 need cpuset in the cgroup-v2 hierarchy to check that CRIU handles cgroup-v2 properly, umount cpuset in cgroup-v1 to make it move to cgroup-v2.
Signed-off-by: Bui Quang Minh --- scripts/ci/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 30dd9ebeb8..48a1e1887d 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -1,4 +1,10 @@ -local: +# Umount cpuset in cgroupv1 to make it move to cgroupv2 +cpuset-cgroupv2: + if [ -d /sys/fs/cgroup/cpuset ]; then \ + umount /sys/fs/cgroup/cpuset; \ + fi + +local: cpuset-cgroupv2 ./run-ci-tests.sh .PHONY: local From f47f5c084120d1d9dae7d19d466391e02c05fcce Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 7 Nov 2022 08:55:28 +0100 Subject: [PATCH 105/122] ci: Do not fail if latest epel repository definition is already installed Signed-off-by: Adrian Reber --- .cirrus.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index dbfb899ffc..914ceb72cc 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -104,7 +104,9 @@ task: setup_script: | ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core + # Do not fail if latest epel repository definition is already installed + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : + yum install -y dnf-plugins-core yum config-manager --set-enabled powertools yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-junit_xml xmlto alternatives --set python /usr/bin/python3 @@ -133,7 +135,8 @@ task: setup_script: | # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. 
- yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + # Do not fail if latest epel repository definition is already installed + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel # Even with selinux in permissive mode the selinux tests will be executed From 979c84209bc2662b9bd2666702906de4e11190a9 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 7 Nov 2022 08:26:10 +0100 Subject: [PATCH 106/122] ci: move cgroup unmounting to run-ci-tests.sh A previous commit added a cgroup cpuset unmounting to scripts/ci/Makefile. We are sometimes running in a container without the necessary privileges to unmount certain cgroups. This commit moves the cgroup unmounting to a place in run-ci-tests.sh which already requires privileged access and does not break unprivileged build-only CI runs. 
Signed-off-by: Adrian Reber --- scripts/ci/Makefile | 8 +------- scripts/ci/run-ci-tests.sh | 5 +++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 48a1e1887d..30dd9ebeb8 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -1,10 +1,4 @@ -# Umount cpuset in cgroupv1 to make it move to cgroupv2 -cpuset-cgroupv2: - if [ -d /sys/fs/cgroup/cpuset ]; then \ - umount /sys/fs/cgroup/cpuset; \ - fi - -local: cpuset-cgroupv2 +local: ./run-ci-tests.sh .PHONY: local diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 1b761ea563..7b64c6b066 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -144,6 +144,11 @@ time make unittest [ -n "$SKIP_CI_TEST" ] && exit 0 +# Umount cpuset in cgroupv1 to make it move to cgroupv2 +if [ -d /sys/fs/cgroup/cpuset ]; then + umount /sys/fs/cgroup/cpuset +fi + ulimit -c unlimited cgid=$$ From 614fb7de02f72a486c532b2124a4334de3c307e5 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Tue, 8 Nov 2022 21:10:52 +0700 Subject: [PATCH 107/122] kerndat: Mark memfd_create(MFD_HUGETLB) unavailable when ENOSYS is returned Some users on Raspberry Pi report that the kerndat checking for memfd_create(MFD_HUGETLB) support returns ENOSYS even when memfd_create syscall is available. We currently treat this error as unexpected and return error. This commit marks the memfd_create(MFD_HUGETLB) as unavailable when ENOSYS is returned. 
Signed-off-by: Bui Quang Minh --- criu/kerndat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index a209190eea..5b567e79ff 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -500,7 +500,7 @@ static bool kerndat_has_memfd_hugetlb(void) if (ret >= 0) { kdat.has_memfd_hugetlb = true; close(ret); - } else if (ret == -1 && (errno == EINVAL || errno == ENOENT)) { + } else if (ret == -1 && (errno == EINVAL || errno == ENOENT || errno == ENOSYS)) { kdat.has_memfd_hugetlb = false; } else { pr_perror("Unexpected error from memfd_create(\"\", MFD_HUGETLB)"); From 52435d72b8bb205ec7ccfb8a988b01c9b20c3cc1 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 3 Nov 2022 22:04:53 +0700 Subject: [PATCH 108/122] cgroup: Remove redundant code that handles zombie tasks Zombie tasks are dumped in dump_zombies() so it is redundant to handle them in dump_one_task(). Deprecate cg_set in task_core_entry as this field must be per thread now. Signed-off-by: Bui Quang Minh --- criu/cr-dump.c | 9 --------- criu/cr-restore.c | 7 +++++-- images/core.proto | 1 + 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index e31b2f7028..63eb627fc2 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -811,16 +811,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item goto err; } - /* - * We don't support multithreads zombie tasks so there is - * no thread_core in zombie tasks, store the cg_set in - * task_core in these cases. 
- */ cg_set = &core->thread_core->cg_set; - if (item->pid->state == TASK_THREAD) { - core->tc->has_cg_set = true; - cg_set = &core->tc->cg_set; - } ret = dump_thread_cgroup(item, cg_set, info, -1); if (ret) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 78f2a9701f..974202f16f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1350,9 +1350,12 @@ static inline int fork_with_pid(struct pstree_item *item) item->pid->state = ca.core->tc->task_state; - /* Zombie task's cg_set is stored in task_core */ + /* + * Zombie tasks' cgroup is not dumped/restored. + * cg_set == 0 is skipped in prepare_task_cgroup() + */ if (item->pid->state == TASK_DEAD) - rsti(item)->cg_set = ca.core->tc->cg_set; + rsti(item)->cg_set = 0; else rsti(item)->cg_set = ca.core->thread_core->cg_set; diff --git a/images/core.proto b/images/core.proto index 1ee32bfda9..bc8b7a4885 100644 --- a/images/core.proto +++ b/images/core.proto @@ -40,6 +40,7 @@ message task_core_entry { optional task_timers_entry timers = 7; optional task_rlimits_entry rlimits = 8; + /* This is deprecated, should be per-thread */ optional uint32 cg_set = 9; optional signal_queue_entry signals_s = 10; From 1c6517a88cc02c46616e965e55ddf5927c3d7b74 Mon Sep 17 00:00:00 2001 From: Mathias Gibbens Date: Thu, 17 Nov 2022 22:28:57 +0000 Subject: [PATCH 109/122] Remove execute bit from source file Signed-off-by: Mathias Gibbens --- images/core-mips.proto | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 images/core-mips.proto diff --git a/images/core-mips.proto b/images/core-mips.proto old mode 100755 new mode 100644 From 7fee7d263cfd49e94fa0890f0828cb5d194cdb00 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 9 Nov 2022 11:01:29 +0000 Subject: [PATCH 110/122] amdgpu: define __nmk_dir if missing This patch adds a missing definition for `__nmk_dir` in the Makefile for the amdgpu plugin. 
This definition is required, for example, when building the `test_topology_remap` target: make -C plugins/amdgpu/ test_topology_remap Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 367a52c99e..64a923d388 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -12,6 +12,7 @@ LIBDRM_INC := -I/usr/include/libdrm DEPS_OK := amdgpu_plugin.so amdgpu_plugin_test DEPS_NOK := ; +__nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk CC := gcc From 6f3b81d0dad59824b60c7f90e4241aba40c2fa66 Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Mon, 21 Nov 2022 21:57:05 +0000 Subject: [PATCH 111/122] Fix warnings from -Wstrict-prototypes in clang 16.0.0 While building on a machine that has a HOL clang compiler, I ran into warnings regarding the changed line. It appears this warning is on by default because of anticipated changes to the C standard. Signed-off-by: Drew Wock --- criu/net.c | 2 +- criu/util.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/net.c b/criu/net.c index 2eff519c50..f29a166f8e 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3366,7 +3366,7 @@ int collect_net_namespaces(bool for_dump) struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net"); -struct ns_id *net_get_root_ns() +struct ns_id *net_get_root_ns(void) { static struct ns_id *root_netns = NULL; diff --git a/criu/util.c b/criu/util.c index b3b2b6659d..959e609388 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1876,7 +1876,7 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) uint64_t criu_run_id; -void util_init() +void util_init(void) { struct timespec tp; From b50d3d7b4eb92848fa7271b6f2f98b673ce6f63b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 23 Nov 2022 14:59:08 +0000 Subject: [PATCH 112/122] ci/lint: install ShellCheck with dnf The way ShellCheck is installed was changed in commit 
c056f99 (ci/gha/lint: install a recent shellcheck) to use the latest version v0.8.0 and remove some of the "shellcheck disable=..." annotations. Since then, Fedora 37 has been released and the ShellCheck package has been updated to v0.8.0. Signed-off-by: Radostin Stoyanov --- .github/workflows/lint.yml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 3d42f3dcf0..4c05285e64 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,18 +9,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format - - # TODO: remove this and use ShellCheck from repo once F37 with ShellCheck 0.8.0 is out. - - name: install shellcheck - env: - VERSION: v0.8.0 - BASEURL: https://github.com/koalaman/shellcheck/releases/download - SHA256: f4bce23c11c3919c1b20bcb0f206f6b44c44e26f2bc95f8aa708716095fa0651 - run: | - curl -sSfL --retry 5 $BASEURL/$VERSION/shellcheck-$VERSION.linux.x86_64.tar.xz | - tar xfJ - -C /usr/local/bin --strip 1 shellcheck-$VERSION/shellcheck - sha256sum --strict --check - <<<"$SHA256 /usr/local/bin/shellcheck" + run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format ShellCheck - uses: actions/checkout@v2 From 5b9f7a93fc5dfc6fe1d696f5a4060ea7b3feb1f9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 23 Nov 2022 15:19:02 +0000 Subject: [PATCH 113/122] ci/alpine: remove symlink for /usr/bin/python The python3 package in Alpine has recently been updated to install symbolic link for /usr/bin/python. 
https://git.alpinelinux.org/aports/commit/main/python3?id=d91da210b1614eb75517d59b7f348fee01699f35 This causes the following error in CI: Step 10/11 : RUN ln -s /usr/bin/python3 /usr/bin/python ---> Running in a5a94be9dc93 ln: failed to create symbolic link '/usr/bin/python': File exists The command '/bin/sh -c ln -s /usr/bin/python3 /usr/bin/python' returned a non-zero code: 1 Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.alpine | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index cab72e8a18..eced46c22c 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -45,7 +45,4 @@ RUN adduser -u 1000 -D test RUN pip3 install junit_xml -# For zdtm we need an unversioned python binary -RUN ln -s /usr/bin/python3 /usr/bin/python - RUN make -C test/zdtm From 4f659d591fa8289f78655e88668de81ffa97f1ca Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 24 Nov 2022 10:48:35 +0000 Subject: [PATCH 114/122] ci: fix make indent This patch applies the changes required by clang-format v15.0.5 for `make indent`. 
Signed-off-by: Radostin Stoyanov --- test/zdtm/static/stopped03.c | 2 +- test/zdtm/static/stopped04.c | 2 +- test/zdtm/transition/maps007.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/stopped03.c b/test/zdtm/static/stopped03.c index 85c7177f78..9a373930fe 100644 --- a/test/zdtm/static/stopped03.c +++ b/test/zdtm/static/stopped03.c @@ -23,7 +23,7 @@ struct shared { futex_t fstate; int status; int code; -} * sh; +} *sh; static int new_pgrp(void) { diff --git a/test/zdtm/static/stopped04.c b/test/zdtm/static/stopped04.c index 237094ca43..9bd968aa2b 100644 --- a/test/zdtm/static/stopped04.c +++ b/test/zdtm/static/stopped04.c @@ -21,7 +21,7 @@ struct shared { futex_t fstate; int status; int code; -} * sh; +} *sh; static int new_pgrp(void) { diff --git a/test/zdtm/transition/maps007.c b/test/zdtm/transition/maps007.c index 8a605cfe03..35c196bc43 100644 --- a/test/zdtm/transition/maps007.c +++ b/test/zdtm/transition/maps007.c @@ -38,7 +38,7 @@ int main(int argc, char **argv) struct { futex_t delta; futex_t stop; - } * shm; + } *shm; uint32_t v; unsigned long long count = 0; int i; From 7819a11e9476aabf9b3b2fd6f003f8b786874c6f Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Tue, 6 Sep 2022 18:02:06 +0800 Subject: [PATCH 115/122] files-reg.c: fiemap algorithm for ghost file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to reduce the number of system calls, based on https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git/tree/misc/create_inode.c#n519, I created a new algorithm for dumping chunks via fiemap (copy_file_to_chunks_fiemap). Also, I added another BOOL_OPT for users to determine which algorithm they want to use. Moreover, for filesystems that do not support fiemap, criu will fall back to the original algorithm (SEEK_HOLE/SEEK_DATA). 
v2: don't call copy_chunk_from_file on outstanding extent; rearrange headers to work around "redeclaration of ‘enum fsconfig_command’" problem Signed-off-by: Liang-Chun Chen --- criu/config.c | 2 + criu/files-reg.c | 108 ++++++++++++++++++++++++++++++++++++-- criu/include/cr_options.h | 4 ++ 3 files changed, 110 insertions(+), 4 deletions(-) diff --git a/criu/config.c b/criu/config.c index 9ba79c8ef3..9f02ae9928 100644 --- a/criu/config.c +++ b/criu/config.c @@ -430,6 +430,7 @@ void init_opts(void) opts.pre_dump_mode = PRE_DUMP_SPLICE; opts.file_validation_method = FILE_VALIDATION_DEFAULT; opts.network_lock_method = NETWORK_LOCK_DEFAULT; + opts.ghost_fiemap = FIEMAP_DEFAULT; } bool deprecated_ok(char *what) @@ -701,6 +702,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), BOOL_OPT("unprivileged", &opts.unprivileged), + BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), {}, }; diff --git a/criu/files-reg.c b/criu/files-reg.c index 2e3d57c5ef..13e114cea0 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -11,10 +11,13 @@ #include #include #include -#include +#include #include +#include +#include #include "tty.h" +#include "stats.h" #ifndef SEEK_DATA #define SEEK_DATA 3 @@ -32,6 +35,7 @@ */ #define BUILD_ID_MAP_SIZE 1048576 #define ST_UNIT 512 +#define EXTENT_MAX_COUNT 512 #include "cr_options.h" #include "imgset.h" @@ -221,6 +225,92 @@ static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) return 0; } +static int skip_outstanding(struct fiemap_extent *fe, size_t file_size) +{ + /* Skip outstanding extent */ + if (fe->fe_logical > file_size) + return 1; + + /* Skip outstanding part of the extent */ + if (fe->fe_logical + fe->fe_length > file_size) + fe->fe_length = file_size - fe->fe_logical; + return 0; +} + +static int copy_file_to_chunks_fiemap(int fd, struct cr_img *img, size_t file_size) +{ + GhostChunkEntry 
ce = GHOST_CHUNK_ENTRY__INIT; + struct fiemap *fiemap_buf; + struct fiemap_extent *ext_buf; + int ext_buf_size, fie_buf_size; + off_t pos = 0; + unsigned int i; + int ret = 0; + int exit_code = 0; + + ext_buf_size = EXTENT_MAX_COUNT * sizeof(struct fiemap_extent); + fie_buf_size = sizeof(struct fiemap) + ext_buf_size; + + fiemap_buf = xzalloc(fie_buf_size); + if (!fiemap_buf) { + pr_perror("Out of memory when allocating fiemap"); + return -1; + } + + ext_buf = fiemap_buf->fm_extents; + fiemap_buf->fm_length = FIEMAP_MAX_OFFSET; + fiemap_buf->fm_flags |= FIEMAP_FLAG_SYNC; + fiemap_buf->fm_extent_count = EXTENT_MAX_COUNT; + + do { + fiemap_buf->fm_start = pos; + memzero(ext_buf, ext_buf_size); + ret = ioctl(fd, FS_IOC_FIEMAP, fiemap_buf); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + exit_code = -EOPNOTSUPP; + } else { + exit_code = -1; + pr_perror("fiemap ioctl() failed"); + } + goto out; + } else if (fiemap_buf->fm_mapped_extents == 0) { + goto out; + } + + for (i = 0; i < fiemap_buf->fm_mapped_extents; i++) { + if (skip_outstanding(&fiemap_buf->fm_extents[i], file_size)) + continue; + + ce.len = fiemap_buf->fm_extents[i].fe_length; + ce.off = fiemap_buf->fm_extents[i].fe_logical; + + if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) { + exit_code = -1; + goto out; + } + + if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) { + exit_code = -1; + goto out; + } + + if (fiemap_buf->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST) { + /* there are no extents left, break. */ + goto out; + } + } + + /* Record file's logical offset as pos */ + pos = ce.len + ce.off; + + /* Since there are still extents left, continue. 
*/ + } while (fiemap_buf->fm_mapped_extents == EXTENT_MAX_COUNT); +out: + xfree(fiemap_buf); + return exit_code; +} + static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) { int ret; @@ -913,10 +1003,20 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de goto err_out; } - if (gfe.chunks) - ret = copy_file_to_chunks(fd, img, st->st_size); - else + if (gfe.chunks) { + if (opts.ghost_fiemap) { + ret = copy_file_to_chunks_fiemap(fd, img, st->st_size); + if (ret == -EOPNOTSUPP) { + pr_debug("file system don't support fiemap\n"); + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { ret = copy_file(fd, img_raw_fd(img), st->st_size); + } + close(fd); if (ret) goto err_out; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index eacaa03a67..c7e98c756c 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -95,6 +95,9 @@ enum FILE_VALIDATION_OPTIONS { /* This constant dictates which file validation method should be tried by default. */ #define FILE_VALIDATION_DEFAULT FILE_VALIDATION_BUILD_ID +/* This constant dictates that criu use fiemap to copy ghost file by default.*/ +#define FIEMAP_DEFAULT 1 + struct irmap; struct irmap_path_opt { @@ -167,6 +170,7 @@ struct cr_options { int enable_external_masters; bool aufs; /* auto-detected, not via cli */ bool overlayfs; + int ghost_fiemap; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED bool has_binfmt_misc; /* auto-detected */ #endif From f4a91fc401c2fed040863e20eb03faa4430c9cd8 Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Sun, 11 Sep 2022 15:26:29 +0800 Subject: [PATCH 116/122] zdtm: add two tests for highly sparse ghost file ghost_multi_hole00 and ghost_multi_hole01 are tests which create a ghost file with a lot of holes, there are 4K data and 4K hole inside every 8K length. 
The only difference between them is ghost-fiemap option, 01 is a test for the fiemap dumping algorithm, and we want to test the behavior of EXTENT_MAX_COUNT part, so the file size should be 8M, thus there will be 1024 chunks in the ghost file. In some file system, such as xfs, we somehow can not easily create highly sparse file as in ext4 or btrfs, therefore we need `fallocate` to forcibly create holes. Signed-off-by: Liang-Chun Chen --- test/zdtm/static/Makefile | 2 + test/zdtm/static/ghost_multi_hole00.c | 122 +++++++++++++++++++++++ test/zdtm/static/ghost_multi_hole00.desc | 1 + test/zdtm/static/ghost_multi_hole01.c | 1 + test/zdtm/static/ghost_multi_hole01.desc | 1 + 5 files changed, 127 insertions(+) create mode 100644 test/zdtm/static/ghost_multi_hole00.c create mode 100644 test/zdtm/static/ghost_multi_hole00.desc create mode 120000 test/zdtm/static/ghost_multi_hole01.c create mode 100644 test/zdtm/static/ghost_multi_hole01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index edac92c83a..000488133d 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -308,6 +308,8 @@ TST_FILE = \ ghost_holes02 \ ghost_holes_large00 \ ghost_holes_large01 \ + ghost_multi_hole00 \ + ghost_multi_hole01 \ unlink_largefile \ mtime_mmap \ fifo \ diff --git a/test/zdtm/static/ghost_multi_hole00.c b/test/zdtm/static/ghost_multi_hole00.c new file mode 100644 index 0000000000..0f78d4f144 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole00.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with a lot of holes(every 8K length contains only 4K data)"; +const char *test_author = "Liang-Chun Chen "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for hole size */ +#define BUFSIZE 4096 +static unsigned char buf4k[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define 
SEEK_HOLE 4 +#endif + +#define FILE_SIZE (1 << 23) /* 8Mb */ + +#define FILE_INTERVAL (1 << 13) /* 8Kb */ + +int main(int argc, char **argv) +{ + int fd, off; + struct stat st; + uint32_t crc; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + crc = ~0; + datagen(buf4k, BUFSIZE, &crc); + if (pwrite(fd, &buf4k, BUFSIZE, off) != BUFSIZE) { + perror("pwrite"); + goto failed; + } + + /* + * In some file system, such as xfs, + * only pwrite might not able to create highly sparse file, + * so we need to forcibly allocate hole inside the file. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off + BUFSIZE, BUFSIZE)) { + perror("fallocate"); + goto failed; + } + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("Size %u OK\n", FILE_SIZE); + + /* Data*/ + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + if (pread(fd, buf4k, BUFSIZE, off) != BUFSIZE) { + fail("pread failed @ %u", off / FILE_INTERVAL); + goto failed; + } + + crc = ~0; + if (datachk(buf4k, BUFSIZE, &crc)) { + fail("datachk failed @ %u", off / FILE_INTERVAL); + goto failed; + } + + test_msg("Data @%du OK\n", off / FILE_INTERVAL); + } + + /* Hole */ + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + if (lseek(fd, off, SEEK_HOLE) != off + BUFSIZE) { + fail("failed to find hole @ %u", off / FILE_SIZE); + goto failed; + } + test_msg("Hole @%du OK\n", off / FILE_INTERVAL); + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} 
diff --git a/test/zdtm/static/ghost_multi_hole00.desc b/test/zdtm/static/ghost_multi_hole00.desc new file mode 100644 index 0000000000..3981e81804 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole00.desc @@ -0,0 +1 @@ +{'dopts': '--ghost-limit 8M --no-ghost-fiemap'} diff --git a/test/zdtm/static/ghost_multi_hole01.c b/test/zdtm/static/ghost_multi_hole01.c new file mode 120000 index 0000000000..c75006a6bf --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole01.c @@ -0,0 +1 @@ +ghost_multi_hole00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_multi_hole01.desc b/test/zdtm/static/ghost_multi_hole01.desc new file mode 100644 index 0000000000..d1dc68a54d --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole01.desc @@ -0,0 +1 @@ +{'dopts': '--ghost-limit 8M --ghost-fiemap'} From 50db2be1a7c5797d8a6391e2733ae379dabb67f5 Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 2 Dec 2022 01:52:20 +0530 Subject: [PATCH 117/122] Fix typo in comment Signed-off-by: Shubham Verma --- test/zdtm/static/s390x_regs_check.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/zdtm/static/s390x_regs_check.c b/test/zdtm/static/s390x_regs_check.c index 40c480b3f2..82dca0519d 100644 --- a/test/zdtm/static/s390x_regs_check.c +++ b/test/zdtm/static/s390x_regs_check.c @@ -40,13 +40,13 @@ const char *test_author = "Michael Holzheu "; * * - Verify that "criu restore" sets the correct register sets * from "criu dump": - * $ zdtmp.py run -t zdtm/static/s390x_regs_check + * $ zdtm.py run -t zdtm/static/s390x_regs_check * * - Verify that dumpee continues running with correct registers after * parasite injection: - * $ zdtmp.py run --norst -t zdtm/static/s390x_regs_check - * $ zdtmp.py run --norst --pre 2 -t zdtm/static/s390x_regs_check - * $ zdtmp.py run --check-only -t zdtm/static/s390x_regs_check + * $ zdtm.py run --norst -t zdtm/static/s390x_regs_check + * $ zdtm.py run --norst --pre 2 -t zdtm/static/s390x_regs_check + * $ zdtm.py run 
--check-only -t zdtm/static/s390x_regs_check */ #define NR_THREADS 2 #define NR_THREADS_ALL (NR_THREADS + 1) From b3c728617542f2a778f8f3bcda83b01257899bcb Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Mon, 14 Nov 2022 09:23:00 -0800 Subject: [PATCH 118/122] non-root: Rework socket bufs for unprivileged mode SO_SNDBUFFORCE/SO_RCVBUFFORCE require root or CAP_NET_ADMIN. We can use SO_SNDBUF/SO_RCVBUF in some cases and avoid needing elevated privileges. This patch renames sk_setbufs() to sk_setbufs_ns() and makes sk_setbufs() a general helper that sets socket send and receive buffer sizes. The helper tries to use SO_SNDBUFFORCE/SO_RCVBUFFORCE first and falls back to SO_SNDBUF/SO_RCVBUF if we're in unprivileged mode. The existing sk_setbufs_ns() which takes a pid parameter and is intended to be called via userns_call() is rewritten to call sk_setbufs(). Existing code that sets buffer sizes via setsockopt() is modified to call sk_setbufs() instead. Signed-off-by: Younes Manton --- criu/fdstore.c | 15 ++------------- criu/include/sockets.h | 1 + criu/pidfd-store.c | 5 ++--- criu/sockets.c | 32 ++++++++++++++++++++++++-------- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/criu/fdstore.c b/criu/fdstore.c index 03afa9f178..d615ad15d0 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -15,6 +15,7 @@ #include "util.h" #include "cr_options.h" #include "util-caps.h" +#include "sockets.h" /* clang-format off */ static struct fdstore_desc { @@ -29,8 +30,6 @@ int fdstore_init(void) uint32_t buf[2] = { INT_MAX / 2, INT_MAX / 2 }; struct sockaddr_un addr; unsigned int addrlen; - int rcv_opt_name; - int snd_opt_name; struct stat st; int sk, ret; @@ -53,17 +52,7 @@ int fdstore_init(void) return -1; } - if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { - rcv_opt_name = SO_RCVBUFFORCE; - snd_opt_name = SO_SNDBUFFORCE; - } else { - rcv_opt_name = SO_RCVBUF; - snd_opt_name = SO_SNDBUF; - } - - if (setsockopt(sk, SOL_SOCKET, snd_opt_name, &buf[0], 
sizeof(buf[0])) < 0 || - setsockopt(sk, SOL_SOCKET, rcv_opt_name, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(sk, buf)) { close(sk); return -1; } diff --git a/criu/include/sockets.h b/criu/include/sockets.h index 399d38664c..c3e7c879a7 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -27,6 +27,7 @@ struct socket_desc { extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); extern int dump_socket_opts(int sk, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); +extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); diff --git a/criu/pidfd-store.c b/criu/pidfd-store.c index b15568e08e..9fdc74cb74 100644 --- a/criu/pidfd-store.c +++ b/criu/pidfd-store.c @@ -13,6 +13,7 @@ #include "log.h" #include "util.h" #include "pidfd-store.h" +#include "sockets.h" struct pidfd_entry { pid_t pid; @@ -94,9 +95,7 @@ int init_pidfd_store_sk(pid_t pid, int sk) * This is similar to how fdstore_init() works. 
*/ if (addrlen == sizeof(sa_family_t)) { - if (setsockopt(pidfd_store_sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(pidfd_store_sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(pidfd_store_sk, buf)) { goto err; } diff --git a/criu/sockets.c b/criu/sockets.c index db772707b6..7708344d6a 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -29,6 +29,7 @@ #include "pstree.h" #include "util.h" #include "fdstore.h" +#include "cr_options.h" #undef LOG_PREFIX #define LOG_PREFIX "sockets: " @@ -465,18 +466,33 @@ int do_restore_opt(int sk, int level, int name, void *val, int len) return 0; } -static int sk_setbufs(void *arg, int fd, pid_t pid) +int sk_setbufs(int sk, uint32_t *bufs) { - u32 *buf = (u32 *)arg; + uint32_t sndbuf = bufs[0], rcvbuf = bufs[1]; - if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0])) - return -1; - if (restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1])) - return -1; + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &sndbuf, sizeof(sndbuf)) || + setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &rcvbuf, sizeof(rcvbuf))) { + if (opts.unprivileged) { + pr_info("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE, falling back to SO_SNDBUF/SO_RCVBUF\n"); + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) || + setsockopt(sk, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf))) { + pr_perror("Unable to set socket SO_SNDBUF/SO_RCVBUF"); + return -1; + } + } else { + pr_perror("Unable to set socket SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + return -1; + } + } return 0; } +static int sk_setbufs_ns(void *arg, int fd, pid_t pid) +{ + return sk_setbufs(fd, (uint32_t *)arg); +} + /* * Set sizes of buffers to maximum and prevent blocking * Caller of this fn should call other socket restoring @@ -489,7 +505,7 @@ int restore_prepare_socket(int sk) /* In kernel a bufsize has type int and a value is doubled. 
*/ u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 }; - if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk)) + if (userns_call(sk_setbufs_ns, 0, maxbuf, sizeof(maxbuf), sk)) return -1; /* Prevent blocking on restore */ @@ -517,7 +533,7 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); /* setsockopt() multiplies the input values by 2 */ - ret |= userns_call(sk_setbufs, 0, bufs, sizeof(bufs), sk); + ret |= userns_call(sk_setbufs_ns, 0, bufs, sizeof(bufs), sk); if (soe->has_so_buf_lock) { pr_debug("\trestore buf_lock %d for socket\n", soe->so_buf_lock); From 318ff086198037fbbea24007935a0da4d6b70f44 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Mon, 21 Nov 2022 11:14:20 -0800 Subject: [PATCH 119/122] non-root: Don't dump socket option SO_MARK if 0 Restoring SO_MARK requires root or CAP_NET_ADMIN. If the value is 0 we will avoid dumping it so that we don't need to do a privileged call on restore. Signed-off-by: Younes Manton --- criu/sockets.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/criu/sockets.c b/criu/sockets.c index 7708344d6a..c99fc7b50d 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -647,8 +647,13 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); soe->has_so_rcvlowat = true; ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); - soe->has_so_mark = true; + /* + * Restoring SO_MARK requires root or CAP_NET_ADMIN. Avoid saving it + * in unprivileged mode if still has its default value. 
+ */ ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); + if (soe->so_mark != 0) + soe->has_so_mark = true; ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); soe->so_snd_tmo_sec = tv.tv_sec; From 6e11e7f081828edb1bad7e83ea566cf197c2e514 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 12 Dec 2022 09:32:58 -0800 Subject: [PATCH 120/122] sockets: tiny style fix Signed-off-by: Andrei Vagin --- criu/sockets.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/sockets.c b/criu/sockets.c index c99fc7b50d..d17e0a9869 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -652,8 +652,7 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) * in unprivileged mode if still has its default value. */ ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); - if (soe->so_mark != 0) - soe->has_so_mark = true; + soe->has_so_mark = !!soe->so_mark; ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); soe->so_snd_tmo_sec = tv.tv_sec; From 008c2b9c7f86a5e27fc721a28e3669134ef98969 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 13 Dec 2022 09:39:49 -0800 Subject: [PATCH 121/122] test/javaTests: update org.testng:testng (Maven) TestNG is vulnerable to Path Traversal Fixes https://github.com/checkpoint-restore/criu/security/dependabot/1. 
Signed-off-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.hotspot-alpine | 2 +- scripts/build/Dockerfile.hotspot-ubuntu | 2 +- scripts/build/Dockerfile.openj9-ubuntu | 2 +- test/javaTests/pom.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index d6e6e51308..cb9332fd0c 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:8-alpine +FROM docker.io/library/eclipse-temurin:11-alpine ARG CC=gcc RUN apk update && apk add \ diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 8936adf815..350102818b 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:8-focal +FROM docker.io/library/eclipse-temurin:11-focal ARG CC=gcc COPY scripts/ci/apt-install /bin/apt-install diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 2e35358ff5..23db14e8df 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/ibm-semeru-runtimes:open-8-jdk-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal ARG CC=gcc COPY scripts/ci/apt-install /bin/apt-install diff --git a/test/javaTests/pom.xml b/test/javaTests/pom.xml index faae44d1bf..ddb6c89cf1 100644 --- a/test/javaTests/pom.xml +++ b/test/javaTests/pom.xml @@ -38,7 +38,7 @@ org.testng testng - 6.3.1 + 7.7.0 From 972ae4e7f771b8a47a86a24a589c446e1be19205 Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Thu, 15 Dec 2022 16:13:45 +0000 Subject: [PATCH 122/122] link-remap: Add --keep-link-remaps option When specified, this option disables the automatic deletion of link-remaps on restore. 
This allows checkpoints dumped with --link-remap to be restored multiple times (provided that other conditions for reuse are met). Signed-off-by: Drew Wock --- criu/config.c | 1 + criu/crtools.c | 1 + criu/files-reg.c | 3 +++ criu/include/cr_options.h | 6 ++++++ 4 files changed, 11 insertions(+) diff --git a/criu/config.c b/criu/config.c index 9f02ae9928..234af2f21a 100644 --- a/criu/config.c +++ b/criu/config.c @@ -703,6 +703,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), BOOL_OPT("unprivileged", &opts.unprivileged), BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT("keep-link-remaps", &opts.keep_link_remaps), {}, }; diff --git a/criu/crtools.c b/criu/crtools.c index ac05bc8215..832a36d596 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -445,6 +445,7 @@ int main(int argc, char *argv[], char *envp[]) " --evasive-devices use any path to a device file if the original one\n" " is inaccessible\n" " --link-remap allow one to link unlinked files back when possible\n" + " --keep-link-remaps On restore, don't automatically remove link remaps.\n" " --ghost-limit size limit max size of deleted file contents inside image\n" " --action-script FILE add an external action script\n" " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" diff --git a/criu/files-reg.c b/criu/files-reg.c index 13e114cea0..62da91be2d 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -895,6 +895,9 @@ int try_clean_remaps(bool only_ghosts) struct remap_info *ri; int ret = 0; + if (opts.keep_link_remaps) + return ret; + list_for_each_entry(ri, &remaps, list) { if (ri->rpe->remap_type == REMAP_TYPE__GHOST) ret |= clean_one_remap(ri); diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index c7e98c756c..e19ca370f6 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -236,6 +236,12 @@ struct cr_options { * explicitly request it as it comes with 
many limitations. */ int unprivileged; + + /* + * On restore, do not remove link-remaps. This allows a checkpoint taken with --link-remap + * to be reused. + */ + int keep_link_remaps; }; extern struct cr_options opts;