diff --git a/.cirrus.yml b/.cirrus.yml index 2b6903ddc5..914ceb72cc 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -19,6 +19,34 @@ task: build_script: | make -C scripts/ci vagrant-fedora-no-vdso +task: + name: CentOS Stream 9 based test + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: centos-cloud + image: family/centos-stream-9 + platform: linux + cpu: 4 + memory: 8G + + setup_script: | + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + dnf config-manager --set-enabled crb # Same as CentOS 8 powertools + dnf -y install epel-release epel-next-release + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf python-junit_xml python-flake8 xmlto + systemctl stop sssd + # Even with selinux in permissive mode the selinux tests will be executed. + # The Cirrus CI user runs as a service from selinux point of view and is + # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0). + # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode. 
+ setenforce 0 + + build_script: | + make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" + task: name: Vagrant Fedora Rawhide based test environment: @@ -41,7 +69,28 @@ task: make -C scripts/ci vagrant-fedora-rawhide task: - name: CentOS 8 based test + name: Vagrant Fedora based test (non-root) + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: cirrus-images + image: family/docker-kvm + platform: linux + cpu: 4 + memory: 16G + nested_virtualization: true + + setup_script: | + scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + sudo kvm-ok + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + build_script: | + make -C scripts/ci vagrant-fedora-non-root + +task: + name: CentOS Stream 8 based test environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" @@ -55,9 +104,11 @@ task: setup_script: | ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core + # Do not fail if latest epel repository definition is already installed + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : + yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-junit_xml xmlto 
alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed @@ -65,7 +116,6 @@ task: # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode setenforce 0 - pip3 install junit_xml build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" @@ -84,6 +134,9 @@ task: memory: 8G setup_script: | + # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. + # Do not fail if latest epel repository definition is already installed + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel # Even with selinux in permissive mode the selinux tests will be executed @@ -98,3 +151,40 @@ task: build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" + +task: + name: aarch64 build GCC (native) + arm_container: + image: docker.io/library/ubuntu:jammy + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/apt-install make + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci local + +task: + name: aarch64 build CLANG (native) 
+ arm_container: + image: docker.io/library/ubuntu:jammy + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/apt-install make + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci local CLANG=1 + +task: + name: aarch64 Fedora Rawhide + arm_container: + image: registry.fedoraproject.org/fedora:rawhide + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/prepare-for-fedora-rawhide.sh + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 + make -C test/zdtm -j 4 diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index 07eb8be653..0000000000 --- a/.drone.yml +++ /dev/null @@ -1,82 +0,0 @@ ---- -kind: pipeline -type: docker -name: aarch64 build GCC (native) - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: ubuntu:focal - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local - ---- -kind: pipeline -type: docker -name: aarch64 build CLANG (native) - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: ubuntu:focal - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local CLANG=1 - ---- -kind: pipeline -type: docker -name: armhf build GCC (native) - -platform: - os: linux - arch: arm - -steps: -- name: build - # At the time of setting up focal did not work - image: ubuntu:bionic - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local - ---- -kind: pipeline -type: docker -name: armhf build CLANG (native) - -platform: - os: linux - arch: arm - -steps: -- name: build - # At the time of setting up focal did not work - image: ubuntu:bionic - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local CLANG=1 - ---- -kind: pipeline -type: docker -name: aarch64 Fedora Rawhide - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: 
registry.fedoraproject.org/fedora:rawhide - commands: - - scripts/ci/prepare-for-fedora-rawhide.sh - - make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 - - make -C test/zdtm -j 4 diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index 00bc3b2bda..b6d94d23ed 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -9,4 +9,8 @@ jobs: steps: - uses: actions/checkout@v2 - name: Run Fedora Rawhide Test - run: sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" + # We need to pass environment variables from the CI environment to + # distinguish between CI environments. However, we need to make sure that + # XDG_RUNTIME_DIR environment variable is not set due to a bug in Podman. + # FIXME: https://github.com/containers/podman/issues/14920 + run: sudo -E XDG_RUNTIME_DIR= make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" diff --git a/.github/workflows/openj9-test.yml b/.github/workflows/java-test.yml similarity index 54% rename from .github/workflows/openj9-test.yml rename to .github/workflows/java-test.yml index 1d7a1eb6b7..211953495b 100644 --- a/.github/workflows/openj9-test.yml +++ b/.github/workflows/java-test.yml @@ -1,4 +1,4 @@ -name: OpenJ9 Test +name: Java Test on: [push, pull_request] @@ -7,5 +7,5 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - - name: Run OpenJ9 Test - run: sudo make -C scripts/ci openj9-test + - name: Run Java Test + run: sudo make -C scripts/ci java-test diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c1215aeafe..4c05285e64 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,13 +9,25 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 ShellCheck 
clang-tools-extra which findutils codespell + run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format ShellCheck + - uses: actions/checkout@v2 + + - name: Set git safe directory + # https://github.com/actions/checkout/issues/760 + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + - name: Run make lint run: make lint + - name: Run make indent run: > - make indent && + if [ -z "${{github.base_ref}}" ]; then + make indent + else + git fetch origin ${{github.base_ref}} && + git clang-format --style file --extensions c,h --quiet origin/${{github.base_ref}} + fi && STATUS=$(git status --porcelain) && if [ ! -z "$STATUS" ]; then echo "FAIL: some files are not correctly formatted."; diff --git a/.gitignore b/.gitignore index d5135f5f8d..23894d631b 100644 --- a/.gitignore +++ b/.gitignore @@ -42,4 +42,4 @@ lib/.crit-setup.files compel/include/asm include/common/asm include/common/config.h -build/ +build/** diff --git a/Documentation/compel.txt b/Documentation/compel.txt index a44ca22c66..506228f592 100644 --- a/Documentation/compel.txt +++ b/Documentation/compel.txt @@ -97,7 +97,10 @@ Following steps are performed to infect the victim process: - execute system call: *int compel_syscall(ctl, int syscall_nr, long *ret, int arg ...);* - infect victim: *int compel_infect(ctl, nr_thread, size_of_args_area);* - cure the victim: *int compel_cure(ctl);* //ctl pointer is freed by this call - - Resume victim: *int compel_resume_task(pid, orig_state, state);* + - Resume victim: *int compel_resume_task(pid, orig_state, state)* or + *int compel_resume_task_sig(pid, orig_state, state, stop_signo).* + //compel_resume_task_sig() could be used in case when victim is in stopped state. + stop_signo could be read by calling compel_parse_stop_signo(). *ctl* must be configured with blob information by calling *PREFIX_setup_c_header()*, with ctl as its argument. 
*PREFIX* is the argument given to *-p* when calling hgen, else it is deduced from file name. diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 8b128f63ee..3b68f16a4c 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -155,6 +155,12 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty +*--unprivileged*:: + This option tells *criu* to accept the limitations when running + as non-root. Running as non-root requires *criu* at least to have + *CAP_SYS_ADMIN* or *CAP_CHECKPOINT_RESTORE*. For details about running + *criu* as non-root please consult the *NON-ROOT* section. + *-V*, *--version*:: Print program version and exit. @@ -668,6 +674,9 @@ The 'mode' may be one of the following: build-ID cannot be obtained, 'chksm-first' method will be used. This is the default if mode is unspecified. +*--skip-file-rwx-check*:: + Skip checking file permissions (r/w/x for u/g/o) on restore. + *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to @@ -874,6 +883,32 @@ configuration file will overwrite all other configuration file settings or RPC options. *This can lead to undesired behavior of criu and should only be used carefully.* +NON-ROOT +-------- +*criu* can be used as non-root with either the *CAP_SYS_ADMIN* capability +or with the *CAP_CHECKPOINT_RESTORE* capability introduced in Linux kernel 5.9. +*CAP_CHECKPOINT_RESTORE* is the minimum that is required. + +*criu* also needs either *CAP_SYS_PTRACE* or a value of 0 in +*/proc/sys/kernel/yama/ptrace_scope* (see *ptrace*(2)) to be able to interrupt +the process for dumping. + +Running *criu* as non-root has many limitations and depending on the process +to checkpoint and restore it may not be possible. + +In addition to *CAP_CHECKPOINT_RESTORE* it is possible to give *criu* additional +capabilities to enable additional features in non-root mode. 
+ +Currently *criu* can benefit from the following additional capabilities: + + - *CAP_NET_ADMIN* + - *CAP_SYS_CHROOT* + - *CAP_SETUID* + - *CAP_SYS_RESOURCE* + +Independent of the capabilities it is always necessary to use "*--unprivileged*" to +accept *criu*'s limitation in non-root mode. + EXAMPLES -------- To checkpoint a program with pid of *1234* and write all image files into diff --git a/MAINTAINERS b/MAINTAINERS index bb153f1ab0..8fee8e5715 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4,3 +4,5 @@ Mike Rapoport Dmitry Safonov <0x7f454c46@gmail.com> Adrian Reber Pavel Tikhomirov +Radostin Stoyanov +Alexander Mikhalitsyn diff --git a/Makefile b/Makefile index ad70800eb5..6bb1497b37 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: flog criu lib crit .PHONY: all # @@ -233,6 +233,15 @@ soccr/built-in.o: $(CONFIG_HEADER) .FORCE $(SOCCR_A): |soccr/built-in.o criu-deps += $(SOCCR_A) +#flog gets used by criu, build it earlier + +flogMakefile: ; +flog%: + $(Q) $(MAKE) $(build)=flog $@ +flog: + $(Q) $(MAKE) $(build)=flog all +.PHONY: flog + # # CRIU building done in own directory # with slightly different rules so we @@ -275,6 +284,7 @@ lib: crit clean mrproper: $(Q) $(MAKE) $(build)=images $@ + $(Q) $(MAKE) $(build)=flog $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ @@ -418,14 +428,15 @@ lint: flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py flake8 --config=scripts/flake8.cfg scripts/criu-ns + flake8 --config=scripts/flake8.cfg scripts/crit-setup.py flake8 --config=scripts/flake8.cfg coredump/ shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install - shellcheck test/others/crit/*.sh - shellcheck test/others/libcriu/*.sh - shellcheck test/others/crit/*.sh 
test/others/criu-coredump/*.sh - shellcheck test/others/config-file/*.sh + shellcheck -x test/others/crit/*.sh + shellcheck -x test/others/libcriu/*.sh + shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh + shellcheck -x test/others/config-file/*.sh codespell # Do not append \n to pr_perror or fail ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' diff --git a/Makefile.config b/Makefile.config index d46d84f2de..270ec61c0f 100644 --- a/Makefile.config +++ b/Makefile.config @@ -78,7 +78,8 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE OPENAT2 + SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW MEMFD_CREATE \ + OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name define gen-feature-test diff --git a/Makefile.install b/Makefile.install index aafb954697..c798637beb 100644 --- a/Makefile.install +++ b/Makefile.install @@ -7,7 +7,7 @@ MANDIR ?= $(PREFIX)/share/man INCLUDEDIR ?= $(PREFIX)/include LIBEXECDIR ?= $(PREFIX)/libexec RUNDIR ?= /run -PLUGINDIR ?= /var/lib/criu +PLUGINDIR ?= $(PREFIX)/lib/criu # # For recent Debian/Ubuntu with multiarch support. 
diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h index 5f090490d9..8a61b268f8 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -2,14 +2,41 @@ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} - -static inline int ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} +#include +#include + +struct hwbp_cap { + char arch; + char bp_count; +}; + +/* copied from `linux/arch/arm64/include/asm/hw_breakpoint.h` */ +/* Lengths */ +#define ARM_BREAKPOINT_LEN_1 0x1 +#define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 +#define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f +#define ARM_BREAKPOINT_LEN_8 0xff + +/* Privilege Levels */ +#define AARCH64_BREAKPOINT_EL1 1 +#define AARCH64_BREAKPOINT_EL0 2 + +/* Breakpoint */ +#define ARM_BREAKPOINT_EXECUTE 0 + +/* Watchpoints */ +#define ARM_BREAKPOINT_LOAD 1 +#define ARM_BREAKPOINT_STORE 2 +#define AARCH64_ESR_ACCESS_MASK (1 << 6) + +#define DISABLE_HBP 0 +#define ENABLE_HBP 1 + +int ptrace_set_breakpoint(pid_t pid, void *addr); +int ptrace_flush_breakpoints(pid_t pid); #endif diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index bd1ed0da35..d0189f0039 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -2,7 +2,9 @@ #include #include #include +#include #include + #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" @@ -10,6 +12,7 @@ #include "errno.h" #include "infect.h" #include "infect-priv.h" +#include "asm/breakpoints.h" unsigned __page_size = 0; unsigned __page_shift = 0; @@ -176,3 +179,111 @@ unsigned long compel_task_size(void) break; return task_size; } + 
+static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) +{ + static struct hwbp_cap info; + static int available = -1; + + if (available == -1) { + unsigned int val; + struct iovec iovec = { + .iov_base = &val, + .iov_len = sizeof(val), + }; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_HW_BREAK, &iovec) < 0) + available = 0; + else { + info.arch = (char)((val >> 8) & 0xff); + info.bp_count = (char)(val & 0xff); + + available = (info.arch != 0); + } + } + + return available == 1 ? &info : NULL; +} + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + k_rtsigset_t block; + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + /* + * The struct is copied from `arch/arm64/include/asm/hw_breakpoint.h` in + * linux kernel: + * struct arch_hw_breakpoint_ctrl { + * __u32 __reserved : 19, + * len : 8, + * type : 2, + * privilege : 2, + * enabled : 1; + * }; + * + * The part of `struct arch_hw_breakpoint_ctrl` bits meaning is defined + * in <>, + * D13.3.2 DBGBCR_EL1, Debug Breakpoint Control Registers. + */ + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | ENABLE_HBP; + regs.dbg_regs[0].addr = (__u64)addr; + regs.dbg_regs[0].ctrl = ctrl; + iovec.iov_base = ®s; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. 
+ */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } + + if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { + pr_perror("Unable to restart the stopped tracee process %d", pid); + return -1; + } + + return 1; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | DISABLE_HBP; + regs.dbg_regs[0].addr = 0ul; + regs.dbg_regs[0].ctrl = ctrl; + + iovec.iov_base = ®s; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + + return 0; +} diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index eb12c9f7cd..8cc94ba740 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -23,6 +23,11 @@ /* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ #define USER_REDZONE_SIZE 512 +#if _CALL_ELF != 2 +#error Only supporting ABIv2. 
+#else +#define STACK_FRAME_MIN_SIZE 32 +#endif /* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ #define TRAMP_SIZE 6 diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 61cd6e9857..db999ce37f 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -441,13 +441,13 @@ void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { /* - * OpenPOWER ABI requires that r12 is set to the calling function addressi + * OpenPOWER ABI requires that r12 is set to the calling function address * to compute the TOC pointer. */ regs->gpr[12] = new_ip; regs->nip = new_ip; if (stack) - regs->gpr[1] = (unsigned long)stack; + regs->gpr[1] = (unsigned long)stack - STACK_FRAME_MIN_SIZE; regs->trap = 0; } diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 98e2512e7c..01959b95b2 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -245,6 +245,19 @@ static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) return 0; } +static inline void fixup_mxcsr(struct xsave_struct *xsave) +{ + /* + * Right now xsave->i387.mxcsr filled with the random garbage, + * let's make it valid by applying mask which allows all + * features, except the denormals-are-zero feature bit. 
+ * + * See also fpu__init_system_mxcsr function: + * https://github.com/torvalds/linux/blob/8cb1ae19/arch/x86/kernel/fpu/init.c#L117 + */ + xsave->i387.mxcsr &= 0x0000ffbf; +} + /* See arch/x86/kernel/fpu/xstate.c */ static void validate_random_xstate(struct xsave_struct *xsave) { @@ -272,17 +285,6 @@ static void validate_random_xstate(struct xsave_struct *xsave) /* No reserved bits may be set */ memset(&hdr->reserved, 0, sizeof(hdr->reserved)); - - /* - * While using PTRACE_SETREGSET the kernel checks that - * "Reserved bits in MXCSR must be zero." - * if (mxcsr[0] & ~mxcsr_feature_mask) - * return -EINVAL; - * - * As the mxcsr_feature_mask depends on the CPU the easiest solution for - * this error injection test is to set mxcsr just to zero. - */ - xsave->i387.mxcsr = 0; } /* @@ -309,6 +311,8 @@ static int corrupt_extregs(pid_t pid) */ pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? "xsave" : "fpuregs", pid, init_seed); + fixup_mxcsr(&ext_regs); + if (!use_xsave) { if (ptrace(PTRACE_SETFPREGS, pid, NULL, &ext_regs)) { pr_perror("Can't set FPU registers for %d", pid); @@ -584,6 +588,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) int ptrace_set_breakpoint(pid_t pid, void *addr) { + k_rtsigset_t block; int ret; /* Set a breakpoint */ @@ -599,6 +604,16 @@ int ptrace_set_breakpoint(pid_t pid, void *addr) return -1; } + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. 
+ */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); diff --git a/compel/include/ptrace.h b/compel/include/ptrace.h index bf2701e632..00013f9370 100644 --- a/compel/include/ptrace.h +++ b/compel/include/ptrace.h @@ -5,6 +5,8 @@ #include #include +#define PTRACE_SYSCALL_TRAP 0x80 + #define PTRACE_SI_EVENT(_si_code) (((_si_code)&0xFFFF) >> 8) extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 3040a67a78..3bd36dda15 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -18,6 +18,7 @@ extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { unsigned long long sigpnd; unsigned long long shdpnd; + unsigned long long sigblk; char state; int vpid; int ppid; @@ -30,7 +31,9 @@ extern int __must_check compel_wait_task(int pid, int ppid, struct seize_task_status *st, void *data); extern int __must_check compel_stop_task(int pid); +extern int __must_check compel_parse_stop_signo(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); +extern int compel_resume_task_sig(pid_t pid, int orig_state, int state, int stop_signo); struct parasite_ctl; struct parasite_thread_ctl; @@ -38,9 +41,12 @@ struct parasite_thread_ctl; extern struct parasite_ctl __must_check *compel_prepare(int pid); extern struct parasite_ctl __must_check *compel_prepare_noctx(int pid); extern int __must_check compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); +extern int __must_check compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, + unsigned long args_size); extern struct parasite_thread_ctl __must_check 
*compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *); +extern int __must_check compel_start_daemon(struct parasite_ctl *ctl); extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); extern int __must_check compel_cure_local(struct parasite_ctl *ctl); @@ -77,9 +83,9 @@ enum trace_flags { TRACE_EXIT, }; -extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat, enum trace_flags trace); +extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat); -extern int __must_check compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); +extern int __must_check compel_stop_pie(pid_t pid, void *addr, bool no_bp); extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); @@ -174,4 +180,6 @@ extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); +extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); + #endif diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index c78c02a6a0..5aab7aa3ee 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -92,6 +92,12 @@ static int parse_pid_status(int pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(aux, "SigBlk:", 7)) { + if (sscanf(aux + 7, "%llx", &ss->sigblk) != 1) + goto err_parse; + + continue; + } } fclose(f); @@ -186,6 +192,29 @@ static int skip_sigstop(int pid, int nr_signals) return 0; } +#define SIG_MASK(sig) (1ULL << ((sig)-1)) + +#define SIG_IN_MASK(sig, mask) ((sig) > 0 && (sig) <= SIGMAX && (SIG_MASK(sig) & (mask))) + +#define SUPPORTED_STOP_MASK ((1ULL << (SIGSTOP - 1)) | (1ULL << (SIGTSTP - 1))) + +static inline int sig_stop(int 
sig) +{ + return SIG_IN_MASK(sig, SUPPORTED_STOP_MASK); +} + +int compel_parse_stop_signo(int pid) +{ + siginfo_t si; + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si) < 0) { + pr_perror("SEIZE %d: can't parse stopped siginfo", pid); + return -1; + } + + return si.si_signo; +} + /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace @@ -198,7 +227,7 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ void *data) { siginfo_t si; - int status, nr_sigstop; + int status, nr_stopsig; int ret = 0, ret2, wait_errno = 0; /* @@ -275,6 +304,11 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ goto try_again; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } + if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0) goto err; @@ -291,17 +325,32 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ goto err; } - nr_sigstop = 0; - if (ss->sigpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (ss->shdpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (si.si_signo == SIGSTOP) - nr_sigstop++; + nr_stopsig = 0; + if (SIG_IN_MASK(SIGSTOP, ss->sigpnd)) + nr_stopsig++; + if (SIG_IN_MASK(SIGSTOP, ss->shdpnd)) + nr_stopsig++; + + if (SIG_IN_MASK(SIGTSTP, ss->sigpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + if (SIG_IN_MASK(SIGTSTP, ss->shdpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; - if (nr_sigstop) { - if (skip_sigstop(pid, nr_sigstop)) - goto err_stop; + if (sig_stop(si.si_signo)) + nr_stopsig++; + + if (nr_stopsig) { + if (skip_sigstop(pid, nr_stopsig)) { + /* + * Make sure that the task is stopped by a supported stop signal and + * send it again to restore task state before criu intervention. 
+ */ + if (sig_stop(si.si_signo)) + kill(pid, si.si_signo); + else + kill(pid, SIGSTOP); + goto err; + } return COMPEL_TASK_STOPPED; } @@ -313,8 +362,6 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ goto err; } -err_stop: - kill(pid, SIGSTOP); err: if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); @@ -322,6 +369,11 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ } int compel_resume_task(pid_t pid, int orig_st, int st) +{ + return compel_resume_task_sig(pid, orig_st, st, SIGSTOP); +} + +int compel_resume_task_sig(pid_t pid, int orig_st, int st, int stop_signo) { int ret = 0; @@ -345,8 +397,18 @@ int compel_resume_task(pid_t pid, int orig_st, int st) * task with STOP in queue that would get lost after * detach, so stop it again. */ - if (orig_st == COMPEL_TASK_STOPPED) - kill(pid, SIGSTOP); + if (orig_st == COMPEL_TASK_STOPPED) { + /* + * Check that stop_signo contain supported stop signal. + * If it isn't, then send SIGSTOP. It makes sense in the case + * when we get COMPEL_TASK_STOPPED from old image, + * where stop_signo was not yet supported. 
+ */ + if (sig_stop(stop_signo)) + kill(pid, stop_signo); + else + kill(pid, SIGSTOP); + } } else { pr_err("Unknown final state %d\n", st); ret = -1; @@ -905,7 +967,7 @@ static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) return ret; } -int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) { int ret; unsigned long p, map_exchange_size, parasite_size = 0; @@ -1017,15 +1079,23 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l goto err; } - if (parasite_start_daemon(ctl)) - goto err; - return 0; err: return -1; } +int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +{ + if (compel_infect_no_daemon(ctl, nr_threads, args_size)) + return -1; + + if (parasite_start_daemon(ctl)) + return -1; + + return 0; +} + struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid) { struct parasite_thread_ctl *tctl; @@ -1309,7 +1379,6 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pid_t pid = ctl->rpid; user_regs_struct_t regs; int status, ret = 0; - enum trace_flags flag; /* stop getting chld from parasite -- we're about to step-by-step it */ if (restore_child_handler(ctl)) @@ -1350,14 +1419,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return -1; /* Go to sigreturn as closer as we can */ - ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + ret = compel_stop_pie(pid, ctl->sigreturn_addr, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret; - if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag)) - return -1; - - if (ptrace_flush_breakpoints(pid)) + if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) return -1; /* @@ -1369,6 +1435,11 @@ static int 
parasite_fini_seized(struct parasite_ctl *ctl) return 0; } +int compel_start_daemon(struct parasite_ctl *ctl) +{ + return parasite_start_daemon(ctl); +} + int compel_stop_daemon(struct parasite_ctl *ctl) { if (ctl->daemonized) { @@ -1489,7 +1560,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) if (ret) goto err; - ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1), TRACE_ENTER); + ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1)); /* * Don't touch extended registers here: they were restored @@ -1501,7 +1572,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) return ret; } -int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) +int compel_stop_pie(pid_t pid, void *addr, bool no_bp) { int ret; @@ -1518,7 +1589,6 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) * PIE will stop on a breakpoint, next * stop after that will be syscall enter. */ - *tf = TRACE_EXIT; return 0; } @@ -1531,14 +1601,12 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) pr_perror("Unable to restart the %d process", pid); return -1; } - - *tf = TRACE_ENTER; return 0; } static bool task_is_trapped(int status, pid_t pid) { - if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + if (WIFSTOPPED(status) && (WSTOPSIG(status) & ~PTRACE_SYSCALL_TRAP) == SIGTRAP) return true; pr_err("Task %d is in unexpected state: %x\n", pid, status); @@ -1572,15 +1640,13 @@ static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, const * sys_nr - the required syscall number * sys_nr_compat - the required compatible syscall number */ -int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, enum trace_flags trace) +int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) { + enum trace_flags trace = tasks > 1 ? 
TRACE_ALL : TRACE_ENTER; user_regs_struct_t regs; int status, ret; pid_t pid; - if (tasks > 1) - trace = TRACE_ALL; - /* Stop all threads on the enter point in sys_rt_sigreturn */ while (tasks) { pid = wait4(-1, &status, __WALL, NULL); @@ -1594,6 +1660,18 @@ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, pr_debug("%d was trapped\n", pid); + if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) { + /* + * On some platforms such as ARM64, it is impossible to + * pass through a breakpoint, so let's clear it right + * after it has been triggered. + */ + if (ptrace_flush_breakpoints(pid)) { + pr_err("Unable to clear breakpoints\n"); + return -1; + } + goto goon; + } if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); @@ -1707,3 +1785,11 @@ void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) { SET_REG_IP(tctl->th.regs, v); } + +void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack) +{ + if (rstack) + *rstack = ctl->rstack; + if (r_thread_stack) + *r_thread_stack = ctl->r_thread_stack; +} diff --git a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c index 49b685d707..717ee28390 100644 --- a/compel/src/lib/ptrace.c +++ b/compel/src/lib/ptrace.c @@ -23,7 +23,7 @@ int ptrace_suspend_seccomp(pid_t pid) { - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) < 0) { pr_perror("suspending seccomp failed"); return -1; } diff --git a/compel/test/Makefile b/compel/test/Makefile index 63fb76f80d..f46a821ee8 100644 --- a/compel/test/Makefile +++ b/compel/test/Makefile @@ -1,4 +1,4 @@ -all: fdspy infect rsys +all: fdspy infect rsys stack fdspy: $(Q) $(MAKE) -C fdspy @@ -10,8 +10,12 @@ infect: $(Q) $(MAKE) -C infect run .PHONY: infect - rsys: $(Q) $(MAKE) -C rsys $(Q) $(MAKE) -C rsys run .PHONY: rsys + +stack: + $(Q) $(MAKE) -C stack + $(Q) $(MAKE) -C stack run 
+.PHONY: stack diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index e7273b446a..b10db4d472 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -94,15 +94,15 @@ static inline int chk(int fd, int val) int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) - return 0; + return 1; printf("%d, want %d\n", v, val); - return v == val; + return v != val; } int main(int argc, char **argv) { - int p_in[2], p_out[2], p_err[2], pid, i, pass = 1; + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; /* * Prepare IO-s and fork the victim binary @@ -142,9 +142,11 @@ int main(int argc, char **argv) return 1; printf("Checking the victim alive\n"); - pass = chk(p_out[0], 1); - pass = chk(p_out[0], 42); - if (!pass) + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) return 1; /* @@ -176,14 +178,14 @@ int main(int argc, char **argv) printf("Checking the result\n"); /* These two came from parasite */ - pass = chk(p_out[0], 138); - pass = chk(p_out[0], 403); + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); /* These two came from post-infect */ - pass = chk(p_out[0], 1234); - pass = chk(p_out[0], 4096); + err |= chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); - if (pass) + if (!err) printf("All OK\n"); else printf("Something went WRONG\n"); diff --git a/compel/test/stack/.gitignore b/compel/test/stack/.gitignore new file mode 100644 index 0000000000..0a554758d1 --- /dev/null +++ b/compel/test/stack/.gitignore @@ -0,0 +1,4 @@ +parasite.h +parasite.po +spy +victim diff --git a/compel/test/stack/Makefile b/compel/test/stack/Makefile new file mode 100644 index 0000000000..bacfad9624 --- /dev/null +++ b/compel/test/stack/Makefile @@ -0,0 +1,32 @@ +CC := gcc +CFLAGS ?= -O2 -g -Wall -Werror + +COMPEL := ../../../compel/compel-host + +all: victim spy + +run: + ./spy +.PHONY: run + +clean: + rm -f victim + rm -f spy + rm -f parasite.h + rm -f parasite.po + rm -f parasite.o + +victim: victim.c + $(CC) $(CFLAGS) -o $@ 
$^ + +spy: spy.c parasite.h + $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) + +parasite.h: parasite.po + $(COMPEL) hgen -o $@ -f $< + +parasite.po: parasite.o + ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) + +parasite.o: parasite.c + $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ diff --git a/compel/test/stack/parasite.c b/compel/test/stack/parasite.c new file mode 100644 index 0000000000..ad13bd25de --- /dev/null +++ b/compel/test/stack/parasite.c @@ -0,0 +1,38 @@ +#include + +#include +#include + +/* + * Stubs for std compel plugin. + */ +int parasite_trap_cmd(int cmd, void *args) +{ + return 0; +} +void parasite_cleanup(void) +{ +} + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +int parasite_daemon_cmd(int cmd, void *args) +{ + int v; + + switch (cmd) { + case PARASITE_CMD_INC: + v = (*(int *)args) + 1; + break; + case PARASITE_CMD_DEC: + v = (*(int *)args) - 1; + break; + default: + v = -1; + break; + } + + sys_write(1, &v, sizeof(int)); + return 0; +} diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c new file mode 100644 index 0000000000..9b7c9a7f09 --- /dev/null +++ b/compel/test/stack/spy.c @@ -0,0 +1,405 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "parasite.h" + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +#define err_and_ret(msg) \ + do { \ + fprintf(stderr, msg); \ + return -1; \ + } while (0) + +void *saved_data = NULL; + +#define SAVED_DATA_MAX page_size() + +void cleanup_saved_data(void) +{ + free(saved_data); +} + +static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) +{ + printf("\tLC%u: ", lvl); + vprintf(fmt, parms); +} + +static void *get_parasite_rstack_start(struct parasite_ctl *ctl) +{ + void *rstack, *r_thread_stack, *rstack_start; + + compel_get_stack(ctl, &rstack, 
&r_thread_stack); + + rstack_start = rstack; + if (r_thread_stack != NULL && r_thread_stack < rstack_start) + rstack_start = r_thread_stack; + + return rstack_start; +} + +static int page_writable(struct parasite_ctl *ctl, int pid, void *page) +{ + FILE *maps; + size_t maps_line_len = 0; + char *maps_line = NULL; + char victim_maps_path[6 + 11 + 5 + 1]; + int written; + int ret = 0; + + if (((uintptr_t)page & (page_size() - 1)) != 0) { + fprintf(stderr, "Page address not aligned\n"); + ret = -1; + goto done; + } + + written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); + if (written < 0 || written >= sizeof(victim_maps_path)) { + fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); + ret = -1; + goto done; + } + + maps = fopen(victim_maps_path, "r"); + if (maps == NULL) { + perror("Can't open victim's /proc/$pid/maps"); + ret = -1; + goto done; + } + + while (getline(&maps_line, &maps_line_len, maps) != -1) { + unsigned long vmstart, vmend; + char r, w; + + if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { + fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); + ret = -1; + goto free_linebuf; + } + + if (page >= (void *)vmstart && page < (void *)vmend) { + if (w == 'w') { + if (r != 'r') { + fprintf(stderr, "Expecting writable memory to also be readable"); + ret = -1; + goto free_linebuf; + } + ret = 1; + } + break; + } + } + + if (errno) { + perror("Can't read victim's /proc/$pid/maps"); + ret = -1; + } + +free_linebuf: + free(maps_line); + fclose(maps); +done: + return ret; +} + +static void *read_proc_mem(int pid, void *offset, size_t len) +{ + char victim_mem_path[6 + 11 + 4 + 1]; + int written; + int fd; + void *data; + ssize_t mem_read; + + written = snprintf(victim_mem_path, sizeof(victim_mem_path), "/proc/%d/mem", pid); + if (written < 0 || written >= sizeof(victim_mem_path)) { + fprintf(stderr, "Failed to create path string to victim's 
/proc/%d/mem file\n", pid); + return NULL; + } + + fd = open(victim_mem_path, O_RDONLY); + if (fd < 0) { + perror("Failed to open victim's /proc/$pid/mem file"); + return NULL; + } + + data = malloc(len); + if (data == NULL) { + perror("Can't allocate memory to read victim's /proc/$pid/mem file"); + return NULL; + } + + mem_read = pread(fd, data, len, (off_t)offset); + if (mem_read == -1) { + perror("Failed to read victim's /proc/$pid/mem file"); + goto freebuf; + } + + return data; + +freebuf: + free(data); + return NULL; +} + +static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, + size_t *saved_data_size) +{ + size_t page_mask = page_size() - 1; + size_t saved_size = 0; + size_t stack_size_last_page = (uintptr_t)stack & page_mask; + void *next_page = stack; + + if (stack_size_last_page != 0) { + size_t empty_space_last_page = page_size() - stack_size_last_page; + saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); + next_page += page_size() - stack_size_last_page; + } + + while (saved_size < SAVED_DATA_MAX && next_page != NULL) { + switch (page_writable(ctl, pid, next_page)) { + case 1: + saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); + next_page += page_size(); + break; + case 0: + next_page = NULL; + break; + default: + return -1; + } + } + + if (saved_size > 0) { + void *sd; + + sd = read_proc_mem(pid, stack, saved_size); + if (sd == NULL) + return -1; + + *saved_data = sd; + } else { + *saved_data = NULL; + } + + *saved_data_size = saved_size; + + return 0; +} + +static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) +{ + if (saved_data != NULL) { + void *current_data; + + current_data = read_proc_mem(pid, stack, saved_data_size); + if (current_data == NULL) + return -1; + + if (memcmp(saved_data, current_data, saved_data_size) != 0) + return 1; + } + + return 0; +} + +static int do_infection(int pid) +{ + 
int state; + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + int *arg; + void *stack; + size_t saved_data_size; + int saved_data_check; + + compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); + + printf("Stopping task\n"); + state = compel_stop_task(pid); + if (state < 0) + err_and_ret("Can't stop task\n"); + + printf("Preparing parasite ctl\n"); + ctl = compel_prepare(pid); + if (!ctl) + err_and_ret("Can't prepare for infection\n"); + + printf("Configuring contexts\n"); + + /* + * First -- the infection context. Most of the stuff + * is already filled by compel_prepare(), just set the + * log descriptor for parasite side, library cannot + * live w/o it. + */ + ictx = compel_infect_ctx(ctl); + ictx->log_fd = STDERR_FILENO; + + parasite_setup_c_header(ctl); + + printf("Infecting\n"); + if (compel_infect_no_daemon(ctl, 1, sizeof(int))) + err_and_ret("Can't infect victim\n"); + + if (atexit(cleanup_saved_data)) + err_and_ret("Can't register cleanup function with atexit\n"); + + stack = get_parasite_rstack_start(ctl); + if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) + err_and_ret("Can't save data above stack\n"); + + if (compel_start_daemon(ctl)) + err_and_ret("Can't start daemon in victim\n"); + + /* + * Now get the area with arguments and run two + * commands one by one. + */ + arg = compel_parasite_args(ctl, int); + + printf("Running cmd 1\n"); + *arg = 137; + if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) + err_and_ret("Can't run parasite command 1\n"); + + printf("Running cmd 2\n"); + *arg = 404; + if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) + err_and_ret("Can't run parasite command 2\n"); + + saved_data_check = check_saved_data(ctl, pid, stack, saved_data, saved_data_size); + if (saved_data_check == -1) + err_and_ret("Could not check saved data\n"); + if (saved_data_check != 0) + err_and_ret("Saved data unexpectedly modified\n"); + + /* + * Done. Cure and resume the task. 
+ */ + printf("Curing\n"); + if (compel_cure(ctl)) + err_and_ret("Can't cure victim\n"); + + if (compel_resume_task(pid, state, state)) + err_and_ret("Can't unseize task\n"); + + printf("Done\n"); + + return 0; +} + +static inline int chk(int fd, int val) +{ + int v = 0; + + if (read(fd, &v, sizeof(v)) != sizeof(v)) + return 1; + + printf("%d, want %d\n", v, val); + return v != val; +} + +int main(int argc, char **argv) +{ + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; + + /* + * Prepare IO-s and fork the victim binary + */ + if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { + perror("Can't make pipe"); + return -1; + } + + pid = vfork(); + if (pid == 0) { + close(p_in[1]); + dup2(p_in[0], 0); + close(p_in[0]); + close(p_out[0]); + dup2(p_out[1], 1); + close(p_out[1]); + close(p_err[0]); + dup2(p_err[1], 2); + close(p_err[1]); + execl("./victim", "victim", NULL); + exit(1); + } + + close(p_in[0]); + close(p_out[1]); + close(p_err[1]); + + /* + * Tell the little guy some numbers + */ + i = 1; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 42; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + printf("Checking the victim alive\n"); + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) + return 1; + + /* + * Now do the infection with parasite.c + */ + + printf("Infecting the victim\n"); + if (do_infection(pid)) + return 1; + + /* + * Tell the victim some more stuff to check it's alive + */ + i = 1234; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 4096; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + /* + * Stop the victim and check the infection went well + */ + printf("Closing victim stdin\n"); + close(p_in[1]); + printf("Waiting for victim to die\n"); + wait(NULL); + + printf("Checking the result\n"); + + /* These two came from parasite */ + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); + + /* These two came from post-infect */ + err |= 
chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); + + if (!err) + printf("All OK\n"); + else + printf("Something went WRONG\n"); + + return 0; +} diff --git a/compel/test/stack/victim.c b/compel/test/stack/victim.c new file mode 100644 index 0000000000..f94613fa15 --- /dev/null +++ b/compel/test/stack/victim.c @@ -0,0 +1,16 @@ +#include + +int main(int argc, char **argv) +{ + int i; + + while (1) { + if (read(0, &i, sizeof(i)) != sizeof(i)) + break; + + if (write(1, &i, sizeof(i)) != sizeof(i)) + break; + } + + return 0; +} diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index b3a7ca6365..dfa31569fa 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -78,7 +78,7 @@ int cpu_dump_cpuinfo(void) cpu_info.n_x86_entry = 1; cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ? CPUINFO_X86_ENTRY__VENDOR__INTEL : - CPUINFO_X86_ENTRY__VENDOR__AMD; + CPUINFO_X86_ENTRY__VENDOR__AMD; cpu_x86_info.cpu_family = rt_cpu_info.x86_family; cpu_x86_info.model = rt_cpu_info.x86_model; diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index f177b9e7b8..d10e51e480 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -409,7 +409,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { fpu_state_t *fpu_state = core_is_compat(core) ? &sigframe->compat.fpu_state : &sigframe->native.fpu_state; struct xsave_struct *x = core_is_compat(core) ? 
(void *)&fpu_state->fpu_state_ia32.xsave : - (void *)&fpu_state->fpu_state_64.xsave; + (void *)&fpu_state->fpu_state_64.xsave; /* * If no FPU information provided -- we're restoring diff --git a/criu/cgroup-props.c b/criu/cgroup-props.c index 5bed7dd9d9..1b85c5b5a2 100644 --- a/criu/cgroup-props.c +++ b/criu/cgroup-props.c @@ -35,12 +35,29 @@ static const char *____criu_global_props____[] = { "tasks", }; +/* cgroup2 global properties */ +// clang-format off +static const char *____criu_global_props_v2____[] = { + "cgroup.subtree_control", + "cgroup.max.descendants", + "cgroup.max.depth", + "cgroup.freeze", + "cgroup.type", +}; +// clang-format on + cgp_t cgp_global = { .name = "____criu_global_props____", .nr_props = ARRAY_SIZE(____criu_global_props____), .props = ____criu_global_props____, }; +cgp_t cgp_global_v2 = { + .name = "____criu_global_props_v2____", + .nr_props = ARRAY_SIZE(____criu_global_props_v2____), + .props = ____criu_global_props_v2____, +}; + typedef struct { struct list_head list; cgp_t cgp; diff --git a/criu/cgroup.c b/criu/cgroup.c index e05b0832ed..918827d993 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "common/list.h" #include "xmalloc.h" @@ -27,6 +28,7 @@ #include "images/cgroup.pb-c.h" #include "kerndat.h" #include "linux/mount.h" +#include "syscall.h" /* * This structure describes set of controller groups @@ -54,6 +56,7 @@ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; +static pid_t cgroupd_pid; static CgSetEntry *find_rst_set_by_id(u32 id) { @@ -173,6 +176,7 @@ struct cg_controller *new_controller(const char *name) nc->n_controllers = 1; nc->n_heads = 0; + nc->is_threaded = false; INIT_LIST_HEAD(&nc->heads); return nc; @@ -370,7 +374,8 @@ static void free_all_cgroup_props(struct cgroup_dir *ncd) ncd->n_properties = 0; } -static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp) +static int 
dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp, + struct cg_controller *controller) { int j; char buf[PATH_MAX]; @@ -421,6 +426,13 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const prop->value = new; } + /* + * Set the is_threaded flag if cgroup.type's value is threaded, + * ignore all other values. + */ + if (!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) + controller->is_threaded = true; + pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; @@ -436,12 +448,20 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru for (i = 0; i < controller->n_controllers; ++i) { const cgp_t *cgp = cgp_get_props(controller->controllers[i]); - if (dump_cg_props_array(fpath, ncd, cgp) < 0) { + if (dump_cg_props_array(fpath, ncd, cgp, controller) < 0) { pr_err("dumping known properties failed\n"); return -1; } + } - if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { + /* cgroup v2 */ + if (controller->controllers[0][0] == 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global_v2, controller) < 0) { + pr_err("dumping global properties v2 failed\n"); + return -1; + } + } else { + if (dump_cg_props_array(fpath, ncd, &cgp_global, controller) < 0) { pr_err("dumping global properties failed\n"); return -1; } @@ -726,20 +746,28 @@ static int collect_cgroups(struct list_head *ctls) return 0; } -int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args) +int dump_thread_cgroup(const struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args, int id) { - int pid; + int pid, tid; LIST_HEAD(ctls); unsigned int n_ctls = 0; struct cg_set *cs; + if (opts.unprivileged) + return 0; + if (item) pid = item->pid->real; else pid = getpid(); - pr_info("Dumping cgroups for %d\n", pid); - if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) + 
if (id < 0) + tid = pid; + else + tid = item->threads[id].real; + + pr_info("Dumping cgroups for thread %d\n", tid); + if (parse_thread_cgroup(pid, tid, args, &ctls, &n_ctls)) return -1; cs = get_cg_set(&ctls, n_ctls, item); @@ -752,9 +780,10 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ pr_info("Set %d is criu one\n", cs->id); } else { if (item == root_item) { - BUG_ON(root_cgset); - root_cgset = cs; - pr_info("Set %d is root one\n", cs->id); + if (!root_cgset) { + root_cgset = cs; + pr_info("Set %d is root one\n", cs->id); + } } else { struct cg_ctl *root, *stray; @@ -901,6 +930,7 @@ static int dump_controllers(CgroupEntry *cg) list_for_each_entry(cur, &cgroups, l) { cg_controller_entry__init(ce); + ce->is_threaded = cur->is_threaded; ce->cnames = cur->controllers; ce->n_cnames = cur->n_controllers; ce->n_dirs = cur->n_heads; @@ -988,6 +1018,9 @@ int dump_cgroups(void) CgroupEntry cg = CGROUP_ENTRY__INIT; int ret = -1; + if (opts.unprivileged) + return 0; + BUG_ON(!criu_cgset || !root_cgset); /* @@ -1054,8 +1087,15 @@ static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt * it. We restore these properties as soon as the cgroup is created. 
*/ static const char *special_props[] = { - "cpuset.cpus", "cpuset.mems", "devices.list", "memory.kmem.limit_in_bytes", - "memory.swappiness", "memory.oom_control", "memory.use_hierarchy", NULL, + "cpuset.cpus", + "cpuset.mems", + "devices.list", + "memory.kmem.limit_in_bytes", + "memory.swappiness", + "memory.oom_control", + "memory.use_hierarchy", + "cgroup.type", + NULL, }; bool is_special_property(const char *prop) @@ -1296,11 +1336,75 @@ static int restore_perms(int fd, const char *path, CgroupPerms *perms) return 0; } +static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) +{ + char *current, *next; + size_t len, off = 0; + + current = input; + do { + next = strchrnul(current, ' '); + len = next - current; + + output[off] = prefix; + off++; + memcpy(output + off, current, len); + off += len; + output[off] = ' '; + off++; + + current = next + 1; + } while (*next != '\0'); + + return off; +} + +static int restore_cgroup_subtree_control(const CgroupPropEntry *cg_prop_entry_p, int fd) +{ + char buf[1024]; + char line[1024]; + int ret, off = 0; + + ret = read(fd, buf, sizeof(buf) - 1); + if (ret < 0) { + pr_perror("read from cgroup.subtree_control"); + return ret; + } + /* Remove the trailing newline */ + buf[ret] = '\0'; + + /* Remove all current subsys in subtree_control */ + if (buf[0] != '\0') + off = add_subtree_control_prop_prefix(buf, line, '-'); + + /* Add subsys need to be restored in subtree_control */ + if (cg_prop_entry_p->value[0] != '\0') + off += add_subtree_control_prop_prefix(cg_prop_entry_p->value, line + off, '+'); + + /* Remove the trailing space */ + if (off != 0) { + off--; + line[off] = '\0'; + } + + if (write(fd, line, off) != off) { + pr_perror("write to cgroup.subtree_control"); + return -1; + } + + return 0; +} + +/* + * Note: The path string can be modified in this function, + * the length of path string should be at least PATH_MAX. 
+ */ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails) { - int cg, fd, ret = -1; + int cg, fd, ret = -1, flag; CgroupPerms *perms = cg_prop_entry_p->perms; + int is_subtree_control = !strcmp(cg_prop_entry_p->name, "cgroup.subtree_control"); if (opts.manage_cgroups == CG_MODE_IGNORE) return 0; @@ -1317,8 +1421,13 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path); + if (is_subtree_control) + flag = O_RDWR; + else + flag = O_WRONLY; + cg = get_service_fd(CGROUP_YARD); - fd = openat(cg, path, O_WRONLY); + fd = openat(cg, path, flag); if (fd < 0) { pr_perror("bad cgroup path: %s", path); return -1; @@ -1333,6 +1442,17 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat goto out; } + if (is_subtree_control) { + ret = restore_cgroup_subtree_control(cg_prop_entry_p, fd); + goto out; + } + + /* skip restoring cgroup.type if its value is not "threaded" */ + if (!strcmp(cg_prop_entry_p->name, "cgroup.type") && strcmp(cg_prop_entry_p->value, "threaded")) { + ret = 0; + goto out; + } + if (split_lines) { char *line = cg_prop_entry_p->value; char *next_line; @@ -1677,12 +1797,9 @@ static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux return -1; for (j = 0; j < n_controllers; j++) { - if (!strcmp(controllers[j], "cpuset") || !strcmp(controllers[j], "memory") || - !strcmp(controllers[j], "devices")) { - if (restore_special_props(paux, off2, e) < 0) { - pr_err("Restoring special cpuset props failed!\n"); - return -1; - } + if (restore_special_props(paux, off2, e) < 0) { + pr_err("Restoring special cpuset props failed!\n"); + return -1; } } } else { @@ -1820,6 +1937,136 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } +/* + * If a thread is a different cgroup set than the main thread in process, + * it means it is in 
a threaded controller. This daemon receives the cg_set + * number from the restored thread and move this thread to the correct + * cgroup controllers + */ +static int cgroupd(int sk) +{ + pr_info("cgroud: Daemon started\n"); + + while (1) { + struct unsc_msg um; + uns_call_t call; + pid_t tid; + int fd, cg_set, i; + CgSetEntry *cg_set_entry; + int ret; + + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL); + ret = recvmsg(sk, &um.h, 0); + if (ret <= 0) { + pr_perror("cgroupd: recv req error"); + return -1; + } + + unsc_msg_pid_fd(&um, &tid, &fd); + pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set); + + cg_set_entry = find_rst_set_by_id(cg_set); + if (!cg_set_entry) { + pr_err("cgroupd: No set found %d\n", cg_set); + return -1; + } + + for (i = 0; i < cg_set_entry->n_ctls; i++) { + int j, aux_off; + CgMemberEntry *ce = cg_set_entry->ctls[i]; + char aux[PATH_MAX]; + CgControllerEntry *ctrl = NULL; + + for (j = 0; j < n_controllers; j++) { + CgControllerEntry *cur = controllers[j]; + if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { + ctrl = cur; + break; + } + } + + if (!ctrl) { + pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path); + return -1; + } + + /* + * This is not a threaded controller, all threads in this + * process must be in this controller. Main thread has been + * restored, so this thread is in this controller already. + */ + if (!ctrl->is_threaded) + continue; + + aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path); + + /* + * Cgroupd runs outside of the namespaces so we don't + * need to use userns_call here + */ + if (userns_move(aux, 0, tid)) { + pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path); + return -1; + } + } + + /* + * We only want to send the cred which contains thread id back. + * The restored thread recvmsg(MSG_PEEK) until it gets its own + * thread id. 
+ */ + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid); + if (sendmsg(sk, &um.h, 0) <= 0) { + pr_perror("cgroupd: send req error"); + return -1; + } + } + + return 0; +} + +int stop_cgroupd(void) +{ + if (cgroupd_pid) { + sigset_t blockmask, oldmask; + + /* + * Block the SIGCHLD signal to avoid triggering + * sigchld_handler() + */ + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + kill(cgroupd_pid, SIGTERM); + waitpid(cgroupd_pid, NULL, 0); + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + } + + return 0; +} + +static int prepare_cgroup_thread_sfd(void) +{ + int sk; + + sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd); + if (sk < 0) { + pr_err("failed to start cgroupd\n"); + return -1; + } + + if (install_service_fd(CGROUPD_SK, sk) < 0) { + kill(cgroupd_pid, SIGKILL); + waitpid(cgroupd_pid, NULL, 0); + return -1; + } + + return 0; +} + static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot) { size_t dirlen = strlen(*dir_name); @@ -1974,15 +2221,19 @@ int prepare_cgroup(void) n_controllers = ce->n_controllers; controllers = ce->controllers; - if (n_sets) + if (n_sets) { /* * We rely on the fact that all sets contain the same * set of controllers. This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. 
*/ ret = prepare_cgroup_sfd(ce); - else + if (ret < 0) + return ret; + ret = prepare_cgroup_thread_sfd(); + } else { ret = 0; + } return ret; } diff --git a/criu/config.c b/criu/config.c index 14a11f9c3e..234af2f21a 100644 --- a/criu/config.c +++ b/criu/config.c @@ -430,6 +430,7 @@ void init_opts(void) opts.pre_dump_mode = PRE_DUMP_SPLICE; opts.file_validation_method = FILE_VALIDATION_DEFAULT; opts.network_lock_method = NETWORK_LOCK_DEFAULT; + opts.ghost_fiemap = FIEMAP_DEFAULT; } bool deprecated_ok(char *what) @@ -696,14 +697,21 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "cgroup-yard", required_argument, 0, 1096 }, { "pre-dump-mode", required_argument, 0, 1097 }, { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check), { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), + BOOL_OPT("unprivileged", &opts.unprivileged), + BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT("keep-link-remaps", &opts.keep_link_remaps), {}, }; #undef BOOL_OPT + if (argv && argv[0]) + SET_CHAR_OPTS(argv_0, argv[0]); + ret = pre_parse(argc, argv, usage_error, &no_default_config, &cfg_file); if (ret) @@ -1115,6 +1123,11 @@ int check_options(void) } } + if (opts.track_mem && !kdat.has_dirty_track) { + pr_err("Tracking memory is not available. 
Consider omitting --track-mem option.\n"); + return 1; + } + if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; diff --git a/criu/cr-check.c b/criu/cr-check.c index a172806f5c..b54c79387d 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include "../soccr/soccr.h" @@ -30,7 +30,7 @@ #include "sockets.h" #include "crtools.h" #include "log.h" -#include "util-pie.h" +#include "util-caps.h" #include "prctl.h" #include "files.h" #include "sk-inet.h" @@ -52,6 +52,7 @@ #include "net.h" #include "restorer.h" #include "uffd.h" +#include "linux/aio_abi.h" #include "images/inventory.pb-c.h" @@ -104,7 +105,7 @@ static int check_tty(void) static int check_apparmor_stacking(void) { - if (!check_aa_ns_dumping()) + if (!kdat.apparmor_ns_dumping_enabled) return -1; return 0; @@ -515,6 +516,14 @@ static int check_ipc(void) { int ret; + /* + * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however + * for non-root users access() runs with an empty set of caps and will therefore always + * fail. + */ + if (opts.uid) + return 0; + ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; @@ -1039,10 +1048,14 @@ static int check_tcp(void) } val = 1; - ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); - if (ret < 0) { - pr_perror("Can't turn TCP repair mode ON"); - goto out; + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); + if (ret < 0) { + pr_perror("Can't turn TCP repair mode ON"); + goto out; + } + } else { + pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n"); } optlen = sizeof(val); @@ -1291,7 +1304,7 @@ static int check_net_diag_raw(void) { check_sock_diag(); return (socket_test_collect_bit(AF_INET, IPPROTO_RAW) && socket_test_collect_bit(AF_INET6, IPPROTO_RAW)) ? 
0 : - -1; + -1; } static int check_pidfd_store(void) @@ -1394,9 +1407,6 @@ int cr_check(void) struct ns_id *ns; int ret = 0; - if (!is_root_user()) - return -1; - root_item = alloc_pstree_item(); if (root_item == NULL) return -1; @@ -1478,13 +1488,15 @@ int cr_check(void) ret |= check_newifindex(); ret |= check_pidfd_store(); ret |= check_ns_pid(); - ret |= check_apparmor_stacking(); ret |= check_network_lock_nftables(); ret |= check_sockopt_buf_lock(); ret |= check_memfd_hugetlb(); ret |= check_move_mount_set_group(); ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); + + if (kdat.lsm == LSMTYPE__APPARMOR) + ret |= check_apparmor_stacking(); } /* @@ -1653,3 +1665,54 @@ static char *feature_name(int (*func)(void)) } return NULL; } + +static int pr_set_dumpable(int value) +{ + int ret = prctl(PR_SET_DUMPABLE, value, 0, 0, 0); + if (ret < 0) + pr_perror("Unable to set PR_SET_DUMPABLE"); + return ret; +} + +int check_caps(void) +{ + /* Read out effective capabilities and store in opts.cap_eff. */ + if (set_opts_cap_eff()) + goto out; + + /* + * No matter if running as root or not. CRIU always needs + * at least these capabilities. + */ + if (!has_cap_checkpoint_restore(opts.cap_eff)) + goto out; + + /* For some things we need to know if we are running as root. */ + opts.uid = geteuid(); + + if (!opts.uid) { + /* CRIU is running as root. No further checks are necessary. */ + return 0; + } + + if (!opts.unprivileged) { + pr_msg("Running as non-root requires '--unprivileged'\n"); + pr_msg("Please consult the documentation for limitations when running as non-root\n"); + return -1; + } + + /* + * At his point we know we are running as non-root with the necessary + * capabilities available. Now we have to make the process dumpable + * so that /proc/self is not owned by root. 
+ */ + if (pr_set_dumpable(1)) + return -1; + + return 0; +out: + pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); + pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + + return -1; +} diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f58701e5c5..63eb627fc2 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -759,6 +759,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item pid_t pid = item->pid->real; int ret = -1; struct parasite_dump_cgroup_args cgroup_args, *info = NULL; + u32 *cg_set; BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); @@ -781,6 +782,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[0]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; + if (core->tc->task_state == TASK_STOPPED) { + core->tc->has_stop_signo = true; + core->tc->stop_signo = item->pid->stop_signo; + } + ret = parasite_dump_thread_leader_seized(ctl, pid, core); if (ret) goto err; @@ -799,13 +805,14 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item */ if (item->ids->has_cgroup_ns_id && !item->parent) { info = &cgroup_args; + strcpy(cgroup_args.thread_cgrp, "self/cgroup"); ret = parasite_dump_cgroup(ctl, &cgroup_args); if (ret) goto err; } - core->tc->has_cg_set = true; - ret = dump_task_cgroup(item, &core->tc->cg_set, info); + cg_set = &core->thread_core->cg_set; + ret = dump_thread_cgroup(item, cg_set, info, -1); if (ret) goto err; @@ -1034,7 +1041,7 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) return 0; } -static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct rseq_cs *rseq_cs, +static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct criu_rseq_cs *rseq_cs, struct criu_rseq *rseq) { int ret; @@ -1065,10 +1072,11 @@ static int 
read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, st if (!rseq->rseq_cs) return 0; - ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct rseq_cs)); + ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct criu_rseq_cs)); if (ret) { pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, - (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, (unsigned long)sizeof(struct rseq_cs)); + (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, + (unsigned long)sizeof(struct criu_rseq_cs)); return -1; } @@ -1083,7 +1091,7 @@ static int dump_thread_rseq(struct pstree_item *item, int i) CoreEntry *core = item->core[i]; RseqEntry **rseqep = &core->thread_core->rseq_entry; struct criu_rseq rseq = {}; - struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; /* @@ -1149,7 +1157,7 @@ static int dump_thread_rseq(struct pstree_item *item, int i) static int dump_task_rseq(pid_t pid, struct pstree_item *item) { int i; - struct rseq_cs *thread_rseq_cs; + struct criu_rseq_cs *thread_rseq_cs; /* if rseq() syscall isn't supported then nothing to dump */ if (!kdat.has_rseq) @@ -1174,7 +1182,7 @@ static int dump_task_rseq(pid_t pid, struct pstree_item *item) return -1; } -static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) +static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) { return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; } @@ -1182,7 +1190,7 @@ static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) static int fixup_thread_rseq(struct pstree_item *item, int i) { CoreEntry *core = item->core[i]; - struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; /* equivalent to (struct rseq)->rseq_cs is NULL */ @@ 
-1403,6 +1411,38 @@ static int dump_zombies(void) return ret; } +static int dump_task_cgroup(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) +{ + struct parasite_dump_cgroup_args cgroup_args, *info; + int i; + + BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); + for (i = 0; i < item->nr_threads; i++) { + CoreEntry *core = item->core[i]; + + /* Leader is already dumped */ + if (item->pid->real == item->threads[i].real) + continue; + + /* For now, we only need to dump the root task's cgroup ns, because we + * know all the tasks are in the same cgroup namespace because we don't + * allow nesting. + */ + info = NULL; + if (item->ids->has_cgroup_ns_id && !item->parent) { + info = &cgroup_args; + sprintf(cgroup_args.thread_cgrp, "self/task/%d/cgroup", item->threads[i].ns[0].virt); + if (parasite_dump_cgroup(parasite_ctl, &cgroup_args)) + return -1; + } + + if (dump_thread_cgroup(item, &core->thread_core->cg_set, info, i)) + return -1; + } + + return 0; +} + static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) { pid_t pid = item->pid->real; @@ -1675,6 +1715,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; } + ret = dump_task_cgroup(parasite_ctl, item); + if (ret) { + pr_err("Dump cgroup of threads in process (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + ret = compel_stop_daemon(parasite_ctl); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); @@ -2049,7 +2095,7 @@ static int cr_dump_finish(int ret) close_service_fd(CR_PROC_FD_OFF); close_image_dir(); - if (ret) { + if (ret || post_dump_ret) { pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9853c05854..974202f16f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -22,7 +22,6 @@ #include #include "common/compiler.h" -#include "linux/mount.h" #include "linux/rseq.h" #include "clone-noasan.h" @@ -86,6 +85,8 
@@ #include #include "compel/include/asm/syscall.h" +#include "linux/mount.h" + #include "protobuf.h" #include "images/sa.pb-c.h" #include "images/timer.pb-c.h" @@ -351,6 +352,10 @@ static int root_prepare_shared(void) if (ret) goto err; + ret = add_fake_unix_queuers(); + if (ret) + goto err; + /* * This should be called with all packets collected AND all * fdescs and fles prepared BUT post-prep-s not run. @@ -367,10 +372,6 @@ static int root_prepare_shared(void) if (ret) goto err; - ret = add_fake_unix_queuers(); - if (ret) - goto err; - show_saved_files(); err: return ret; @@ -1348,7 +1349,18 @@ static inline int fork_with_pid(struct pstree_item *item) return -1; item->pid->state = ca.core->tc->task_state; - rsti(item)->cg_set = ca.core->tc->cg_set; + + /* + * Zombie tasks' cgroup is not dumped/restored. + * cg_set == 0 is skipped in prepare_task_cgroup() + */ + if (item->pid->state == TASK_DEAD) + rsti(item)->cg_set = 0; + else + rsti(item)->cg_set = ca.core->thread_core->cg_set; + + if (ca.core->tc->has_stop_signo) + item->pid->stop_signo = ca.core->tc->stop_signo; if (item->pid->state != TASK_DEAD && !task_alive(item)) { pr_err("Unknown task state %d\n", item->pid->state); @@ -1805,6 +1817,9 @@ static int restore_task_with_children(void *_arg) goto err; } + if (set_opts_cap_eff()) + goto err; + /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; @@ -1960,6 +1975,10 @@ static int attach_to_tasks(bool root_seized) return -1; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } /* * Suspend seccomp if necessary. 
We need to do this because * although seccomp is restored at the very end of the @@ -2024,7 +2043,7 @@ static int restore_rseq_cs(void) return 0; } -static int catch_tasks(bool root_seized, enum trace_flags *flag) +static int catch_tasks(bool root_seized) { struct pstree_item *item; @@ -2054,7 +2073,7 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, flag, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); if (ret < 0) return -1; } @@ -2063,24 +2082,6 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return 0; } -static int clear_breakpoints(void) -{ - struct pstree_item *item; - int ret = 0, i; - - if (fault_injected(FI_NO_BREAKPOINTS)) - return 0; - - for_each_pstree_item(item) { - if (!task_alive(item)) - continue; - for (i = 0; i < item->nr_threads; i++) - ret |= ptrace_flush_breakpoints(item->threads[i].real); - } - - return ret; -} - static void finalize_restore(void) { struct pstree_item *item; @@ -2104,8 +2105,14 @@ static void finalize_restore(void) xfree(ctl); - if ((item->pid->state == TASK_STOPPED) || (opts.final_state == TASK_STOPPED)) + if (opts.final_state == TASK_STOPPED) kill(item->pid->real, SIGSTOP); + else if (item->pid->state == TASK_STOPPED) { + if (item->pid->stop_signo > 0) + kill(item->pid->real, item->pid->stop_signo); + else + kill(item->pid->real, SIGSTOP); + } } } @@ -2215,7 +2222,6 @@ static void reap_zombies(void) static int restore_root_task(struct pstree_item *init) { - enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item; @@ -2378,6 +2384,10 @@ static int restore_root_task(struct pstree_item *init) if (ret < 0) goto out_kill; + ret = stop_cgroupd(); + if (ret < 0) + goto out_kill; + ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; @@ -2430,7 +2440,7 @@ static int restore_root_task(struct 
pstree_item *init) timing_stop(TIME_RESTORE); - if (catch_tasks(root_seized, &flag)) { + if (catch_tasks(root_seized)) { pr_err("Can't catch all tasks\n"); goto out_kill_network_unlocked; } @@ -2440,15 +2450,12 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_COMPLETE); - ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1)); if (ret) { pr_err("Can't stop all tasks on rt_sigreturn\n"); goto out_kill_network_unlocked; } - if (clear_breakpoints()) - pr_err("Unable to flush breakpoints\n"); - finalize_restore(); /* * Some external devices such as GPUs might need a very late @@ -3078,7 +3085,6 @@ static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc) return 0; } -#if defined(__GLIBC__) && defined(RSEQ_SIG) static void prep_libc_rseq_info(struct rst_rseq_param *rseq) { if (!kdat.has_rseq) { @@ -3086,23 +3092,21 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) return; } - rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); - rseq->rseq_abi_size = __rseq_size; - rseq->signature = RSEQ_SIG; -} + if (!kdat.has_ptrace_get_rseq_conf) { +#if defined(__GLIBC__) && defined(RSEQ_SIG) + rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); + rseq->rseq_abi_size = __rseq_size; + rseq->signature = RSEQ_SIG; #else -static void prep_libc_rseq_info(struct rst_rseq_param *rseq) -{ - /* - * TODO: handle built-in rseq on other libc'ies like musl - * We can do that using get_rseq_conf kernel feature. - * - * For now we just assume that other libc libraries are - * not registering rseq by default. 
- */ - rseq->rseq_abi_pointer = 0; -} + rseq->rseq_abi_pointer = 0; #endif + return; + } + + rseq->rseq_abi_pointer = kdat.libc_rseq_conf.rseq_abi_pointer; + rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size; + rseq->signature = kdat.libc_rseq_conf.signature; +} static rlim_t decode_rlim(rlim_t ival) { @@ -3759,6 +3763,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns prep_libc_rseq_info(&task_args->libc_rseq); + task_args->uid = opts.uid; + for (i = 0; i < CR_CAP_SIZE; i++) + task_args->cap_eff[i] = opts.cap_eff[i]; + /* * Fill up per-thread data. */ @@ -3816,6 +3824,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); + if (rsti(current)->cg_set != tcore->thread_core->cg_set) { + thread_args[i].cg_set = tcore->thread_core->cg_set; + thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); + } else { + thread_args[i].cg_set = -1; + } + ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); if (ret) goto err; @@ -3910,6 +3925,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); + close_service_fd(CGROUPD_SK); __gcov_flush(); diff --git a/criu/cr-service.c b/criu/cr-service.c index a6eb9ebd30..73c48f5a6c 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "version.h" #include "crtools.h" @@ -409,6 +410,12 @@ static int setup_opts_from_req(int sk, CriuOpts *req) pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); } + if (req->has_unprivileged) + opts.unprivileged = req->unprivileged; + + if (check_caps()) + return 1; + if (kerndat_init()) return 1; @@ -464,6 +471,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_shell_job) 
opts.shell_job = req->shell_job; + if (req->has_skip_file_rwx_check) + opts.skip_file_rwx_check = req->skip_file_rwx_check; + if (req->has_file_locks) opts.handle_file_locks = req->file_locks; diff --git a/criu/crtools.c b/criu/crtools.c index cc8d9179fe..832a36d596 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -185,6 +185,9 @@ int main(int argc, char *argv[], char *envp[]) return cr_service_work(atoi(argv[optind + 1])); } + if (check_caps()) + return 1; + if (opts.imgs_dir == NULL) SET_CHAR_OPTS(imgs_dir, "."); @@ -414,6 +417,8 @@ int main(int argc, char *argv[], char *envp[]) " --network-lock METHOD\n" " network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" + " --unprivileged accept limitations when running as non-root\n" + " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" @@ -440,6 +445,7 @@ int main(int argc, char *argv[], char *envp[]) " --evasive-devices use any path to a device file if the original one\n" " is inaccessible\n" " --link-remap allow one to link unlinked files back when possible\n" + " --keep-link-remaps On restore, don't automatically remove link remaps.\n" " --ghost-limit size limit max size of deleted file contents inside image\n" " --action-script FILE add an external action script\n" " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" @@ -504,6 +510,9 @@ int main(int argc, char *argv[], char *envp[]) " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" + " --skip-file-rwx-check\n" + " Skip checking file permissions\n" + " (r/w/x for u/g/o) on restore.\n" "\n" "Check options:\n" " Without options, \"criu check\" checks availability of absolutely required\n" diff --git a/criu/fdstore.c b/criu/fdstore.c index 6a7f73a598..d615ad15d0 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -13,6 +13,9 @@ 
#include "rst-malloc.h" #include "log.h" #include "util.h" +#include "cr_options.h" +#include "util-caps.h" +#include "sockets.h" /* clang-format off */ static struct fdstore_desc { @@ -49,9 +52,7 @@ int fdstore_init(void) return -1; } - if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(sk, buf)) { close(sk); return -1; } diff --git a/criu/files-reg.c b/criu/files-reg.c index 0249063c26..62da91be2d 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -11,8 +11,13 @@ #include #include #include -#include +#include #include +#include +#include + +#include "tty.h" +#include "stats.h" #ifndef SEEK_DATA #define SEEK_DATA 3 @@ -29,6 +34,8 @@ * and checked. */ #define BUILD_ID_MAP_SIZE 1048576 +#define ST_UNIT 512 +#define EXTENT_MAX_COUNT 512 #include "cr_options.h" #include "imgset.h" @@ -218,6 +225,92 @@ static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) return 0; } +static int skip_outstanding(struct fiemap_extent *fe, size_t file_size) +{ + /* Skip outstanding extent */ + if (fe->fe_logical > file_size) + return 1; + + /* Skip outstanding part of the extent */ + if (fe->fe_logical + fe->fe_length > file_size) + fe->fe_length = file_size - fe->fe_logical; + return 0; +} + +static int copy_file_to_chunks_fiemap(int fd, struct cr_img *img, size_t file_size) +{ + GhostChunkEntry ce = GHOST_CHUNK_ENTRY__INIT; + struct fiemap *fiemap_buf; + struct fiemap_extent *ext_buf; + int ext_buf_size, fie_buf_size; + off_t pos = 0; + unsigned int i; + int ret = 0; + int exit_code = 0; + + ext_buf_size = EXTENT_MAX_COUNT * sizeof(struct fiemap_extent); + fie_buf_size = sizeof(struct fiemap) + ext_buf_size; + + fiemap_buf = xzalloc(fie_buf_size); + if (!fiemap_buf) { + pr_perror("Out of memory when allocating fiemap"); + return -1; + } + + ext_buf = fiemap_buf->fm_extents; + 
fiemap_buf->fm_length = FIEMAP_MAX_OFFSET; + fiemap_buf->fm_flags |= FIEMAP_FLAG_SYNC; + fiemap_buf->fm_extent_count = EXTENT_MAX_COUNT; + + do { + fiemap_buf->fm_start = pos; + memzero(ext_buf, ext_buf_size); + ret = ioctl(fd, FS_IOC_FIEMAP, fiemap_buf); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + exit_code = -EOPNOTSUPP; + } else { + exit_code = -1; + pr_perror("fiemap ioctl() failed"); + } + goto out; + } else if (fiemap_buf->fm_mapped_extents == 0) { + goto out; + } + + for (i = 0; i < fiemap_buf->fm_mapped_extents; i++) { + if (skip_outstanding(&fiemap_buf->fm_extents[i], file_size)) + continue; + + ce.len = fiemap_buf->fm_extents[i].fe_length; + ce.off = fiemap_buf->fm_extents[i].fe_logical; + + if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) { + exit_code = -1; + goto out; + } + + if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) { + exit_code = -1; + goto out; + } + + if (fiemap_buf->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST) { + /* there are no extents left, break. */ + goto out; + } + } + + /* Record file's logical offset as pos */ + pos = ce.len + ce.off; + + /* Since there are still extents left, continue. 
*/ + } while (fiemap_buf->fm_mapped_extents == EXTENT_MAX_COUNT); +out: + xfree(fiemap_buf); + return exit_code; +} + static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) { int ret; @@ -802,6 +895,9 @@ int try_clean_remaps(bool only_ghosts) struct remap_info *ri; int ret = 0; + if (opts.keep_link_remaps) + return ret; + list_for_each_entry(ri, &remaps, list) { if (ri->rpe->remap_type == REMAP_TYPE__GHOST) ret |= clean_one_remap(ri); @@ -910,10 +1006,20 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de goto err_out; } - if (gfe.chunks) - ret = copy_file_to_chunks(fd, img, st->st_size); - else + if (gfe.chunks) { + if (opts.ghost_fiemap) { + ret = copy_file_to_chunks_fiemap(fd, img, st->st_size); + if (ret == -EOPNOTSUPP) { + pr_debug("file system don't support fiemap\n"); + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { ret = copy_file(fd, img_raw_fd(img), st->st_size); + } + close(fd); if (ret) goto err_out; @@ -946,8 +1052,8 @@ static int dump_ghost_remap(char *path, const struct stat *st, int lfd, u32 id, pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id); - if (st->st_size > opts.ghost_limit) { - pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_size); + if (st->st_blocks * ST_UNIT > opts.ghost_limit) { + pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_blocks * ST_UNIT); return -1; } @@ -1688,6 +1794,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) int ret; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; + bool skip_for_shell_job = false; if (!p->link) { if (fill_fdlink(lfd, p, &_link)) @@ -1707,11 +1814,15 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) mi = lookup_mnt_id(p->mnt_id); if (mi == NULL) { - pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 
1); - return -1; + if (opts.shell_job && is_tty(p->stat.st_rdev, p->stat.st_dev)) { + skip_for_shell_job = true; + } else { + pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); + return -1; + } } - if (mnt_is_overmounted(mi)) { + if (!skip_for_shell_job && mnt_is_overmounted(mi)) { pr_err("Open files on overmounted mounts are not supported yet\n"); return -1; } @@ -1731,7 +1842,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) return -1; } - if (check_path_remap(link, p, lfd, id, mi->nsid)) + if (!skip_for_shell_job && check_path_remap(link, p, lfd, id, mi->nsid)) return -1; rfe.name = &link->name[1]; ext: @@ -2199,9 +2310,21 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil if (!validate_file(tmp, &st, rfi)) goto err; - if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { - pr_err("File %s has bad mode 0%o (expect 0%o)\n", rfi->path, (int)st.st_mode, rfi->rfe->mode); - goto err; + if (rfi->rfe->has_mode) { + mode_t curr_mode = st.st_mode; + mode_t saved_mode = rfi->rfe->mode; + + if (opts.skip_file_rwx_check) { + curr_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + saved_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + } + + if (curr_mode != saved_mode) { + pr_err("File %s has bad mode 0%o (expect 0%o)\n" + "File r/w/x checks can be skipped with the --skip-file-rwx-check option\n", + rfi->path, (int)curr_mode, saved_mode); + goto err; + } } /* diff --git a/criu/files.c b/criu/files.c index 8a2250e193..38dc076d20 100644 --- a/criu/files.c +++ b/criu/files.c @@ -21,7 +21,7 @@ #include "image.h" #include "common/list.h" #include "rst-malloc.h" -#include "util-pie.h" +#include "util-caps.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" @@ -1346,10 +1346,35 @@ static int fchroot(int fd) return chroot("."); } +static int need_chroot(int saved_root) +{ + struct stat saved_root_stat, cur_root_stat; + int psd; + + if (fstat(saved_root, &saved_root_stat) == -1) { + 
pr_perror("Failed to stat saved root dir"); + return -1; + } + + psd = open_pid_proc(PROC_SELF); + if (psd < 0) { + pr_perror("Failed to open PROC_SELF"); + return -1; + } + + if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { + pr_perror("Failed to stat current root dir"); + return -1; + } + + return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; +} + int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); + bool do_chroot = true; /* * First -- open both descriptors. We will not @@ -1368,15 +1393,24 @@ int restore_fs(struct pstree_item *me) goto out; } + /* + * In unprivileged mode chroot() may fail if we don't have + * sufficient privileges, therefore only do it if the process + * is actually chrooted. + */ + if (opts.unprivileged) + do_chroot = need_chroot(dd_root); + /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. */ - - ret = fchroot(dd_root); - if (ret < 0) { - pr_perror("Can't change root"); - goto out; + if (do_chroot) { + ret = fchroot(dd_root); + if (ret < 0) { + pr_perror("Can't change root"); + goto out; + } } ret = fchdir(dd_cwd); diff --git a/criu/hugetlb.c b/criu/hugetlb.c index aa98662d81..866c4050fd 100644 --- a/criu/hugetlb.c +++ b/criu/hugetlb.c @@ -35,6 +35,19 @@ int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag) return 0; } +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma) +{ + /* + * Dump the hugetlb backed mapping using memfd_hugetlb when it is not + * anonymous private mapping. 
+ */ + if (kdat.has_memfd_hugetlb && is_hugetlb_dev(dev, hugetlb_size_flag) && + !((vma->e->flags & MAP_PRIVATE) && !strncmp(file_path, ANON_HUGEPAGE_PREFIX, ANON_HUGEPAGE_PREFIX_LEN))) + return 1; + + return 0; +} + unsigned long get_size_from_hugetlb_flag(int flag) { int i; diff --git a/criu/image.c b/criu/image.c index 353de48e8f..9fb390ab7e 100644 --- a/criu/image.c +++ b/criu/image.c @@ -226,8 +226,9 @@ int prepare_inventory(InventoryEntry *he) if (get_task_ids(&crt.i)) return -1; - he->has_root_cg_set = true; - if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) + if (!opts.unprivileged) + he->has_root_cg_set = true; + if (dump_thread_cgroup(NULL, &he->root_cg_set, NULL, -1)) return -1; he->root_ids = crt.i.ids; diff --git a/criu/include/aio.h b/criu/include/aio.h index d1655739d9..38e7040209 100644 --- a/criu/include/aio.h +++ b/criu/include/aio.h @@ -1,7 +1,7 @@ #ifndef __CR_AIO_H__ #define __CR_AIO_H__ -#include +#include "linux/aio_abi.h" #include "images/mm.pb-c.h" unsigned int aio_estimate_nr_reqs(unsigned int size); int dump_aio_ring(MmEntry *mme, struct vma_area *vma); diff --git a/criu/include/cgroup-props.h b/criu/include/cgroup-props.h index 11b6775483..10a7061b80 100644 --- a/criu/include/cgroup-props.h +++ b/criu/include/cgroup-props.h @@ -10,6 +10,7 @@ typedef struct { } cgp_t; extern cgp_t cgp_global; +extern cgp_t cgp_global_v2; extern const cgp_t *cgp_get_props(const char *name); extern bool cgp_should_skip_controller(const char *name); extern bool cgp_add_dump_controller(const char *name); diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 2e9b8933ce..93f61539cf 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -7,7 +7,7 @@ struct pstree_item; struct parasite_dump_cgroup_args; extern u32 root_cg_set; -int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args); +int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int 
dump_cgroups(void); int prepare_task_cgroup(struct pstree_item *); int prepare_cgroup(void); @@ -60,6 +60,9 @@ struct cg_controller { /* for cgroup list in cgroup.c */ struct list_head l; + + /* controller is a threaded cgroup or not */ + int is_threaded; }; struct cg_controller *new_controller(const char *name); @@ -87,9 +90,12 @@ struct cg_ctl { */ struct list_head; struct parasite_dump_cgroup_args; -extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); +extern int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *l, + unsigned int *n); extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); +int stop_cgroupd(void); + #endif /* __CR_CGROUP_H__ */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index bf1a762cc6..e19ca370f6 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -1,10 +1,12 @@ #ifndef __CR_OPTIONS_H__ #define __CR_OPTIONS_H__ -#include #include +#include #include "common/config.h" #include "common/list.h" +#include "int.h" +#include "image.h" /* Configuration and CLI parsing order defines */ #define PARSING_GLOBAL_CONF 1 @@ -93,6 +95,9 @@ enum FILE_VALIDATION_OPTIONS { /* This constant dictates which file validation method should be tried by default. 
*/ #define FILE_VALIDATION_DEFAULT FILE_VALIDATION_BUILD_ID +/* This constant dictates that criu use fiemap to copy ghost file by default.*/ +#define FIEMAP_DEFAULT 1 + struct irmap; struct irmap_path_opt { @@ -165,6 +170,7 @@ struct cr_options { int enable_external_masters; bool aufs; /* auto-detected, not via cli */ bool overlayfs; + int ghost_fiemap; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED bool has_binfmt_misc; /* auto-detected */ #endif @@ -179,6 +185,7 @@ struct cr_options { bool lazy_pages; char *work_dir; int network_lock_method; + int skip_file_rwx_check; /* * When we scheduler for removal some functionality we first @@ -209,6 +216,32 @@ struct cr_options { enum criu_mode mode; int mntns_compat_mode; + + /* Remember the program name passed to main() so we can use it in + * error messages elsewhere. + */ + char *argv_0; + /* + * This contains the eUID of the current CRIU user. It + * will only be set to a non-zero value if CRIU has + * the necessary capabilities to run as non root. + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN + */ + uid_t uid; + /* This contains the value from capget()->effective */ + u32 cap_eff[_LINUX_CAPABILITY_U32S_3]; + /* + * If CRIU should be running as non-root with the help of + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should + * explicitly request it as it comes with many limitations. + */ + int unprivileged; + + /* + * On restore, do not remove link-remaps. This allows a checkpoint taken with --link-remap + * to be reused. 
+ */ + int keep_link_remaps; }; extern struct cr_options opts; diff --git a/criu/include/crtools.h b/criu/include/crtools.h index b9309654f9..b54b9d9294 100644 --- a/criu/include/crtools.h +++ b/criu/include/crtools.h @@ -26,6 +26,7 @@ extern int cr_pre_dump_tasks(pid_t pid); extern int cr_restore_tasks(void); extern int convert_to_elf(char *elf_path, int fd_core); extern int cr_check(void); +extern int check_caps(void); extern int cr_dedup(void); extern int cr_lazy_pages(bool daemon); diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index f33918de86..69d670be93 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -24,14 +24,6 @@ enum faults { static inline bool __fault_injected(enum faults f, enum faults fi_strategy) { - /* - * Temporary workaround for Xen guests. Breakpoints degrade - * performance linearly, so until we find out the reason, - * let's disable them. - */ - if (f == FI_NO_BREAKPOINTS) - return true; - return fi_strategy == f; } diff --git a/criu/include/hugetlb.h b/criu/include/hugetlb.h index c0e83652b7..9aee5bed35 100644 --- a/criu/include/hugetlb.h +++ b/criu/include/hugetlb.h @@ -4,6 +4,11 @@ #include #include +#include "vma.h" + +#define ANON_HUGEPAGE_PREFIX "/anon_hugepage" +#define ANON_HUGEPAGE_PREFIX_LEN (sizeof(ANON_HUGEPAGE_PREFIX) - 1) + enum hugepage_size { HUGETLB_16KB, HUGETLB_64KB, @@ -46,6 +51,7 @@ struct htlb_info { extern struct htlb_info hugetlb_info[HUGETLB_MAX]; int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag); +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma); unsigned long get_size_from_hugetlb_flag(int flag); #ifndef MFD_HUGETLB diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 83d867e75b..a3959c9926 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -7,6 +7,7 @@ #include "asm/kerndat.h" #include "util-vdso.h" #include "hugetlb.h" +#include struct stat; @@ -82,6 
+83,7 @@ struct kerndat_s { bool has_openat2; bool has_rseq; bool has_ptrace_get_rseq_conf; + struct __ptrace_rseq_configuration libc_rseq_conf; }; extern struct kerndat_s kdat; diff --git a/criu/include/linux/aio_abi.h b/criu/include/linux/aio_abi.h new file mode 100644 index 0000000000..d9ce787203 --- /dev/null +++ b/criu/include/linux/aio_abi.h @@ -0,0 +1,14 @@ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +typedef __kernel_ulong_t aio_context_t; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#endif /* __LINUX__AIO_ABI_H */ diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h index 9a3a28b100..fefafa89e1 100644 --- a/criu/include/linux/mount.h +++ b/criu/include/linux/mount.h @@ -4,32 +4,40 @@ #include "common/config.h" #include "compel/plugins/std/syscall-codes.h" -#ifdef CONFIG_HAS_FSCONFIG -#include -#else +/* Copied from /usr/include/sys/mount.h */ + +#ifndef FSOPEN_CLOEXEC +/* The type of fsconfig call made. 
*/ enum fsconfig_command { - FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ - FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ - FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ - FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ - FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ - FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ - FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ +#define FSCONFIG_SET_FLAG FSCONFIG_SET_FLAG + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ +#define FSCONFIG_SET_STRING FSCONFIG_SET_STRING + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ +#define FSCONFIG_SET_BINARY FSCONFIG_SET_BINARY + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ +#define FSCONFIG_SET_PATH FSCONFIG_SET_PATH + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ +#define FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_PATH_EMPTY + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ +#define FSCONFIG_SET_FD FSCONFIG_SET_FD + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ +#define FSCONFIG_CMD_CREATE FSCONFIG_CMD_CREATE FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +#define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE }; -#endif -static inline int sys_fsopen(const char *fsname, unsigned int flags) -{ - return syscall(__NR_fsopen, fsname, flags); -} -static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) -{ - return syscall(__NR_fsconfig, fd, cmd, key, value, aux); -} -static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) -{ - return syscall(__NR_fsmount, fd, flags, attr_flags); -} +#endif // FSOPEN_CLOEXEC + +/* fsopen flags. 
With the redundant definition, we check if the kernel, + * glibc value and our value still match. + */ +#define FSOPEN_CLOEXEC 0x00000001 + +#ifndef MS_MGC_VAL +/* Magic mount flag number. Has to be or-ed to the flag values. */ +#define MS_MGC_VAL 0xc0ed0000 /* Magic flag number to indicate "new" flags */ +#define MS_MGC_MSK 0xffff0000 /* Magic flag number mask */ +#endif #endif diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h index a47876e669..5ceefbf8e1 100644 --- a/criu/include/linux/rseq.h +++ b/criu/include/linux/rseq.h @@ -9,7 +9,12 @@ #endif #endif -#ifndef __GLIBC_HAVE_KERNEL_RSEQ +#include +#include + +#include "common/config.h" + +#ifdef CONFIG_HAS_NO_LIBC_RSEQ_DEFS /* * linux/rseq.h * @@ -18,9 +23,6 @@ * Copyright (c) 2015-2018 Mathieu Desnoyers */ -#include -#include - enum rseq_cpu_id_state { RSEQ_CPU_ID_UNINITIALIZED = -1, RSEQ_CPU_ID_REGISTRATION_FAILED = -2, @@ -41,13 +43,20 @@ enum rseq_cs_flags { RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), }; +#endif /* CONFIG_HAS_NO_LIBC_RSEQ_DEFS */ +/* + * Let's use our own definition of struct rseq_cs because some distros + * (for example Mariner GNU/Linux) declares this structure their-own way. + * This makes trouble with inconsistency between printf formatters and + * struct rseq_cs field types. + */ /* * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. It is usually declared as * link-time constant data. */ -struct rseq_cs { +struct criu_rseq_cs { /* Version of this structure. 
*/ __u32 version; /* enum rseq_cs_flags */ @@ -57,7 +66,6 @@ struct rseq_cs { __u64 post_commit_offset; __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); -#endif /* __GLIBC_HAVE_KERNEL_RSEQ */ /* * We have to have our own copy of struct rseq definition because diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index e2ea6e17f6..183a3b8526 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -1,6 +1,8 @@ #ifndef __CR_NS_H__ #define __CR_NS_H__ +#include + #include "common/compiler.h" #include "files.h" #include "common/list.h" @@ -224,4 +226,19 @@ extern int add_ns_shared_cb(int (*actor)(void *data), void *data); extern struct ns_id *get_socket_ns(int lfd); extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); +struct unsc_msg { + struct msghdr h; + /* + * 0th is the call address + * 1st is the flags + * 2nd is the optional (NULL in response) arguments + */ + struct iovec iov[3]; + char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; +}; + +extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid); +extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd); +extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)); + #endif /* __CR_NS_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 1bcd4ff205..36fe670928 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -10,7 +10,7 @@ struct ps_info { extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd); /* User buffer for read-mode pre-dump*/ -#define BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) +#define PIPE_MAX_BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) /* * page_xfer -- transfer pages into image file. 
diff --git a/criu/include/parasite.h b/criu/include/parasite.h index d2a06889f6..787c927be9 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -241,7 +241,12 @@ struct parasite_dump_cgroup_args { * * The string is null terminated. */ - char contents[1 << 12]; + char contents[(1 << 12) - 32]; + /* + * Contains the path to thread cgroup procfs. + * "self/task//cgroup" + */ + char thread_cgrp[32]; }; #endif /* !__ASSEMBLY__ */ diff --git a/criu/include/pid.h b/criu/include/pid.h index 49cb2d322e..b2b7a361a6 100644 --- a/criu/include/pid.h +++ b/criu/include/pid.h @@ -31,6 +31,10 @@ struct pid { pid_t real; int state; /* TASK_XXX constants */ + /* If an item is in stopped state it has a signal number + * that caused task to stop. + */ + int stop_signo; /* * The @virt pid is one which used in the image itself and keeps diff --git a/criu/include/plugin.h b/criu/include/plugin.h index a1796b6418..0115e6ea01 100644 --- a/criu/include/plugin.h +++ b/criu/include/plugin.h @@ -5,7 +5,9 @@ #include "common/compiler.h" #include "common/list.h" -#define CR_PLUGIN_DEFAULT "/var/lib/criu/" +#ifndef CR_PLUGIN_DEFAULT +#define CR_PLUGIN_DEFAULT "/usr/lib/criu/" +#endif void cr_plugin_fini(int stage, int err); int cr_plugin_init(int stage); diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 8ae750e1af..1137046d43 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -63,7 +63,7 @@ struct dmp_info { struct parasite_ctl *parasite_ctl; struct parasite_thread_ctl **thread_ctls; uint64_t *thread_sp; - struct rseq_cs *thread_rseq_cs; + struct criu_rseq_cs *thread_rseq_cs; /* * Although we don't support dumping different struct creds in general, diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 325804e449..bc0beb5cbb 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -121,6 +121,8 @@ struct thread_restore_args { bool seccomp_force_tsync; char comm[TASK_COMM_LEN]; + int cg_set; + int cgroupd_sk; } 
__aligned(64); typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args); @@ -235,6 +237,9 @@ struct task_restore_args { * unregister it before memory restoration procedure */ struct rst_rseq_param libc_rseq; + + uid_t uid; + u32 cap_eff[CR_CAP_SIZE]; } __aligned(64); /* diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index c6979de7f4..4265d94edd 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ -24,6 +24,7 @@ enum sfd_type { */ ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, + CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */ USERNSD_SK, /* Socket for usernsd */ NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to transfer file descriptors */ diff --git a/criu/include/sockets.h b/criu/include/sockets.h index 399d38664c..c3e7c879a7 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -27,6 +27,7 @@ struct socket_desc { extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); extern int dump_socket_opts(int sk, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); +extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); diff --git a/criu/include/syscall.h b/criu/include/syscall.h new file mode 100644 index 0000000000..c38d6d971b --- /dev/null +++ b/criu/include/syscall.h @@ -0,0 +1,17 @@ +#ifndef __CR_SYSCALL_H__ +#define __CR_SYSCALL_H__ + +static inline int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} +static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} +static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + +#endif /* 
__CR_SYSCALL_H__ */ \ No newline at end of file diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index ac7924dcdb..cb3eba8174 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -34,8 +34,9 @@ enum { /* * Some entries might be missing mark them as optional. */ -#define CTL_FLAGS_OPTIONAL 1 -#define CTL_FLAGS_HAS 2 -#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_OPTIONAL 1 +#define CTL_FLAGS_HAS 2 +#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_IPC_EACCES_SKIP 8 /* own bit: 5 would alias CTL_FLAGS_OPTIONAL | CTL_FLAGS_READ_EIO_SKIP */ #endif /* __CR_SYSCTL_H__ */ diff --git a/criu/include/util-caps.h b/criu/include/util-caps.h new file mode 100644 index 0000000000..7ccd162f5e --- /dev/null +++ b/criu/include/util-caps.h @@ -0,0 +1,58 @@ +#ifndef __CR_UTIL_CAPS_H__ +#define __CR_UTIL_CAPS_H__ + +#include + +#ifndef CAP_CHECKPOINT_RESTORE +#define CAP_CHECKPOINT_RESTORE 40 +#endif + +static inline bool has_capability(int cap, u32 *cap_eff) +{ + int mask = CAP_TO_MASK(cap); + int index = CAP_TO_INDEX(cap); + u32 effective; + + effective = cap_eff[index]; + + if (!(mask & effective)) { + pr_debug("Effective capability %d missing\n", cap); + return false; + } + + return true; +} + +static inline bool has_cap_checkpoint_restore(u32 *cap_eff) +{ + /* + * Everything guarded by CAP_CHECKPOINT_RESTORE is also + * guarded by CAP_SYS_ADMIN. Check for both capabilities.
+ */ + if (has_capability(CAP_CHECKPOINT_RESTORE, cap_eff) || has_capability(CAP_SYS_ADMIN, cap_eff)) + return true; + + return false; +} + +static inline bool has_cap_net_admin(u32 *cap_eff) +{ + return has_capability(CAP_NET_ADMIN, cap_eff); +} + +static inline bool has_cap_sys_chroot(u32 *cap_eff) +{ + return has_capability(CAP_SYS_CHROOT, cap_eff); +} + +static inline bool has_cap_setuid(u32 *cap_eff) +{ + return has_capability(CAP_SETUID, cap_eff); +} + +static inline bool has_cap_sys_resource(u32 *cap_eff) +{ + return has_capability(CAP_SYS_RESOURCE, cap_eff); +} + +#endif /* __CR_UTIL_CAPS_H__ */ diff --git a/criu/include/util.h b/criu/include/util.h index 4e29c079ef..3a0403113e 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -386,6 +386,8 @@ extern int mount_detached_fs(const char *fsname); extern char *get_legacy_iptables_bin(bool ipv6); +extern int set_opts_cap_eff(void); + extern ssize_t read_all(int fd, void *buf, size_t size); extern ssize_t write_all(int fd, const void *buf, size_t size); diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c index 4fe082fbbc..7e95be8c52 100644 --- a/criu/ipc_ns.c +++ b/criu/ipc_ns.c @@ -292,6 +292,8 @@ static void pr_info_ipc_shm(const IpcShmEntry *shm) static int ipc_sysctl_req(IpcVarEntry *e, int op) { + int i; + struct sysctl_req req[] = { { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, @@ -332,6 +334,9 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op) if (e->has_shm_next_id) req[nr++] = req[16]; + for (i = 0; i < nr; i++) + req[i].flags = CTL_FLAGS_IPC_EACCES_SKIP; + return sysctl_op(req, nr, op, CLONE_NEWIPC); } @@ -570,7 +575,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) { int ret, id; struct sysctl_req req[] = { - { "kernel/sem_next_id", &sem->desc->id, CTL_U32 }, + { "kernel/sem_next_id", &sem->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct semid_ds semid; @@ -703,7 +708,7 @@ static int 
prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) { int ret, id; struct sysctl_req req[] = { - { "kernel/msg_next_id", &msq->desc->id, CTL_U32 }, + { "kernel/msg_next_id", &msq->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct msqid_ds msqid; @@ -841,7 +846,7 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) { int ret, id, hugetlb_flag = 0; struct sysctl_req req[] = { - { "kernel/shm_next_id", &shm->desc->id, CTL_U32 }, + { "kernel/shm_next_id", &shm->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct shmid_ds shmid; diff --git a/criu/kerndat.c b/criu/kerndat.c index b8b6bc95d7..5b567e79ff 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -21,6 +21,7 @@ #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include #endif +#include #include "common/config.h" #include "int.h" @@ -51,6 +52,7 @@ #include "sched.h" #include "memfd.h" #include "mount-v2.h" +#include "util-caps.h" struct kerndat_s kdat = {}; @@ -420,10 +422,6 @@ static int kerndat_get_dirty_track(void) } else { no_dt: pr_info("Dirty tracking support is OFF\n"); - if (opts.track_mem) { - pr_err("Tracking memory is not available\n"); - return -1; - } } return 0; @@ -502,7 +500,7 @@ static bool kerndat_has_memfd_hugetlb(void) if (ret >= 0) { kdat.has_memfd_hugetlb = true; close(ret); - } else if (ret == -1 && (errno == EINVAL || errno == ENOENT)) { + } else if (ret == -1 && (errno == EINVAL || errno == ENOENT || errno == ENOSYS)) { kdat.has_memfd_hugetlb = false; } else { pr_perror("Unexpected error from memfd_create(\"\", MFD_HUGETLB)"); @@ -927,6 +925,7 @@ static int kerndat_has_ptrace_get_rseq_conf(void) pid_t pid; int len; struct __ptrace_rseq_configuration rseq; + int ret = 0; pid = fork_and_ptrace_attach(NULL); if (pid < 0) @@ -934,6 +933,9 @@ static int kerndat_has_ptrace_get_rseq_conf(void) len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); if (len != sizeof(rseq)) { + if 
(kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = false; pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); goto out; @@ -944,16 +946,27 @@ static int kerndat_has_ptrace_get_rseq_conf(void) * we need to pay attention to that and, possibly, make changes on the CRIU side. */ if (rseq.flags != 0) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = false; pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); } else { + if (!kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = true; + + if (memcmp(&kdat.libc_rseq_conf, &rseq, sizeof(rseq))) + ret = 1; /* we should update kdat */ + + kdat.libc_rseq_conf = rseq; } out: kill(pid, SIGKILL); waitpid(pid, NULL, 0); - return 0; + return ret; } int kerndat_sockopt_buf_lock(void) @@ -1064,19 +1077,66 @@ static int kerndat_has_openat2(void) return 0; } -#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" -#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" +#define KERNDAT_CACHE_NAME "criu.kdat" +#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME + +/* + * Returns: + * -1 if kdat_file was not written due to error + * 0 if kdat_file was written + * 1 if kdat_file was not written because cache directory undefined in env (non-root mode) + */ +static int get_kerndat_filename(char **kdat_file) +{ + int ret; + /* + * Running as non-root, even with CAP_CHECKPOINT_RESTORE, does not + * allow to write to KDAT_RUNDIR which usually is only writable by root. + * Let's write criu.kdat file to XDG_RUNTIME_DIR for non-root cases. + * Note that XDG_RUNTIME_DIR is not always defined (e.g. when executing + * via su/sudo). + */ + if (opts.unprivileged) { + const char *cache_dir = getenv("XDG_RUNTIME_DIR"); + if (!cache_dir) { + pr_warn("$XDG_RUNTIME_DIR not set. 
Cannot find location for kerndat file\n"); + return 1; + } + ret = asprintf(kdat_file, "%s/%s", cache_dir, KERNDAT_CACHE_NAME); + } else { + ret = asprintf(kdat_file, "%s", KERNDAT_CACHE_FILE); + } + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return -1; + } + + return 0; +} + +/* + * Returns: + * -1 if error + * 0 if cache was loaded + * 1 if cache does not exist or is stale or cache directory undefined in env (non-root mode) + */ static int kerndat_try_load_cache(void) { + cleanup_free char *kdat_file = NULL; int fd, ret; - fd = open(KERNDAT_CACHE_FILE, O_RDONLY); + ret = get_kerndat_filename(&kdat_file); + if (ret) + return ret; + + fd = open(kdat_file, O_RDONLY); if (fd < 0) { if (ENOENT == errno) - pr_debug("File %s does not exist\n", KERNDAT_CACHE_FILE); + pr_debug("File %s does not exist\n", kdat_file); else - pr_warn("Can't load %s\n", KERNDAT_CACHE_FILE); + pr_warn("Can't load %s\n", kdat_file); return 1; } @@ -1090,12 +1150,12 @@ static int kerndat_try_load_cache(void) close(fd); if (ret != sizeof(kdat) || kdat.magic1 != KDAT_MAGIC || kdat.magic2 != KDAT_MAGIC_2) { - pr_warn("Stale %s file\n", KERNDAT_CACHE_FILE); - unlink(KERNDAT_CACHE_FILE); + pr_warn("Stale %s file\n", kdat_file); + unlink(kdat_file); return 1; } - pr_info("Loaded kdat cache from %s\n", KERNDAT_CACHE_FILE); + pr_info("Loaded kdat cache from %s\n", kdat_file); return 0; } @@ -1103,8 +1163,20 @@ static void kerndat_save_cache(void) { int fd, ret; struct statfs s; + cleanup_free char *kdat_file = NULL; + cleanup_free char *kdat_file_tmp = NULL; + + if (get_kerndat_filename(&kdat_file)) + return; + + ret = asprintf(&kdat_file_tmp, "%s.tmp", kdat_file); - fd = open(KERNDAT_CACHE_FILE_TMP, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return; + } + + fd = open(kdat_file_tmp, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd < 0) /* * It can happen that we race with some 
other criu @@ -1113,6 +1185,10 @@ static void kerndat_save_cache(void) */ return; + /* + * If running as root we store the cache file on a tmpfs (/run), + * because the file should be gone after reboot. + */ if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { pr_warn("Can't keep kdat cache on non-tempfs\n"); close(fd); @@ -1126,20 +1202,21 @@ static void kerndat_save_cache(void) */ kdat.magic1 = KDAT_MAGIC; kdat.magic2 = KDAT_MAGIC_2; + ret = write(fd, &kdat, sizeof(kdat)); close(fd); if (ret == sizeof(kdat)) - ret = rename(KERNDAT_CACHE_FILE_TMP, KERNDAT_CACHE_FILE); + ret = rename(kdat_file_tmp, kdat_file); else { ret = -1; errno = EIO; } if (ret < 0) { - pr_perror("Couldn't save %s", KERNDAT_CACHE_FILE); + pr_perror("Couldn't save %s", kdat_file); unl: - unlink(KERNDAT_CACHE_FILE_TMP); + unlink(kdat_file_tmp); /* drop the temporary file, never the published cache; a stale tmp file would make the O_EXCL open fail on every later save */ } } @@ -1147,6 +1224,14 @@ static int kerndat_uffd(void) { int uffd, err = 0; + if (opts.unprivileged) + /* + * If running as non-root uffd_open() fails with + * 'Operation not permitted'. Just ignore uffd for + * non-root for now.
+ */ + return 0; + kdat.uffd_features = 0; uffd = uffd_open(0, &kdat.uffd_features, &err); @@ -1476,12 +1561,57 @@ int kerndat_try_load_new(void) if (ret < 0) return ret; + ret = kerndat_has_ptrace_get_rseq_conf(); + if (ret < 0) { + pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); + return ret; + } + /* New information is found, we need to save to the cache */ if (ret) kerndat_save_cache(); return 0; } +static int root_only_init(void) +{ + int ret = 0; + + if (opts.unprivileged) + return 0; + + if (!ret && kerndat_loginuid()) { + pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_tun_netns()) { + pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_unix_file()) { + pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_link_nsid()) { + pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_netns()) { + pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_nftables_concat()) { + pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_move_mount_set_group()) { + pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); + ret = -1; + } + + return ret; +} + int kerndat_init(void) { int ret; @@ -1499,7 +1629,16 @@ int kerndat_init(void) memset(&kdat, 0, sizeof(kdat)); preload_socket_modules(); - preload_netfilter_modules(); + if (!opts.unprivileged) + /* + * This uses 'iptables -L' to implicitly load necessary modules. + * If the non nft backed iptables is used it does a + * openat(AT_FDCWD, "/run/xtables.lock", O_RDONLY|O_CREAT, 0600) = -1 EACCES + * which will fail as non-root. There are no capabilities to + * change this. 
The iptables nft backend fails with + * openat(AT_FDCWD, "/proc/net/ip_tables_names", O_RDONLY) = -1 EACCES + */ + preload_netfilter_modules(); if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); @@ -1537,10 +1676,14 @@ int kerndat_init(void) pr_err("get_ipv6 failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_loginuid()) { - pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + if (!ret && kerndat_nsid()) { + pr_err("kerndat_nsid failed when initializing kerndat.\n"); ret = -1; } + + if (!ret && root_only_init()) + ret = -1; + if (!ret && kerndat_iptables_has_xtlocks()) { pr_err("kerndat_iptables_has_xtlocks failed when initializing kerndat.\n"); ret = -1; @@ -1553,22 +1696,6 @@ int kerndat_init(void) pr_err("kerndat_compat_restore failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_tun_netns()) { - pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_socket_unix_file()) { - pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_nsid()) { - pr_err("kerndat_nsid failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_link_nsid()) { - pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_has_memfd_create()) { pr_err("kerndat_has_memfd_create failed when initializing kerndat.\n"); ret = -1; @@ -1599,10 +1726,6 @@ int kerndat_init(void) pr_err("kerndat_vdso_preserves_hint failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_socket_netns()) { - pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_x86_has_ptrace_fpu_xsave_bug()) { pr_err("kerndat_x86_has_ptrace_fpu_xsave_bug failed when initializing kerndat.\n"); ret = -1; @@ -1627,7 +1750,7 @@ int kerndat_init(void) pr_err("has_time_namespace failed when initializing kerndat.\n"); ret = -1; } - 
if (!ret && kerndat_has_newifindex()) { + if (!ret && (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) && kerndat_has_newifindex()) { pr_err("kerndat_has_newifindex failed when initializing kerndat.\n"); ret = -1; } @@ -1641,18 +1764,10 @@ int kerndat_init(void) pr_err("kerndat_has_nspid failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_nftables_concat()) { - pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_sockopt_buf_lock()) { pr_err("kerndat_sockopt_buf_lock failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_move_mount_set_group()) { - pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_has_openat2()) { pr_err("kerndat_has_openat2 failed when initializing kerndat.\n"); ret = -1; @@ -1661,7 +1776,7 @@ int kerndat_init(void) pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_ptrace_get_rseq_conf()) { + if (!ret && (kerndat_has_ptrace_get_rseq_conf() < 0)) { pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); ret = -1; } diff --git a/criu/mem.c b/criu/mem.c index 136439518f..ab86a1f6d7 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -246,6 +246,12 @@ prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_lis */ if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) continue; + /* + * We totally ignore MAP_HUGETLB on pre-dump. + * See also generate_vma_iovs() comment. + */ + if ((vma->e->flags & MAP_HUGETLB) && skip_non_trackable) + continue; if (vma->e->prot & PROT_READ) continue; @@ -402,7 +408,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - if (vma_entry_is(vma->e, VMA_AREA_AIORING)) { + /* + * We want to completely ignore these VMA types on the pre-dump: + * 1. 
VMA_AREA_AIORING because it is not soft-dirty trackable (kernel writes) + * 2. MAP_HUGETLB mappings because they are not premapped and we can't use + * parent images from pre-dump stages. Instead, the content is restored from + * the parasite context using full memory image. + */ + if (vma_entry_is(vma->e, VMA_AREA_AIORING) || vma->e->flags & MAP_HUGETLB) { if (pre_dump) return 0; has_parent = false; diff --git a/criu/mount-v2.c b/criu/mount-v2.c index 623016d428..5d53e9a226 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -925,27 +925,25 @@ static int move_mount_set_group(int src_id, char *source, int dst_id) return 0; } -static int restore_one_sharing_group(struct sharing_group *sg) +static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) { - struct mount_info *first, *other; - char first_path[PATH_MAX]; - int first_fd; + char target_path[PATH_MAX]; + int target_fd; - first = get_first_mount(sg); - first_fd = fdstore_get(first->mnt_fd_id); - BUG_ON(first_fd < 0); - snprintf(first_path, sizeof(first_path), "/proc/self/fd/%d", first_fd); + target_fd = fdstore_get(target->mnt_fd_id); + BUG_ON(target_fd < 0); + snprintf(target_path, sizeof(target_path), "/proc/self/fd/%d", target_fd); - /* Restore first's master_id from shared_id of the source */ + /* Restore target's master_id from shared_id of the source */ if (sg->master_id) { if (sg->parent) { - struct mount_info *p; + struct mount_info *first; /* Get shared_id from parent sharing group */ - p = get_first_mount(sg->parent); - if (move_mount_set_group(p->mnt_fd_id, NULL, first->mnt_fd_id)) { - pr_err("Failed to copy sharing from %d to %d\n", p->mnt_id, first->mnt_id); - close(first_fd); + first = get_first_mount(sg->parent); + if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { + pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); + close(target_fd); return -1; } } else { @@ -956,39 +954,77 @@ static int 
restore_one_sharing_group(struct sharing_group *sg) * or non-shared slave). If source is a private mount * we would fail. */ - if (move_mount_set_group(-1, sg->source, first->mnt_fd_id)) { - pr_err("Failed to copy sharing from source %s to %d\n", sg->source, first->mnt_id); - close(first_fd); + if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { + pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); + close(target_fd); return -1; } } /* Convert shared_id to master_id */ - if (mount(NULL, first_path, NULL, MS_SLAVE, NULL)) { - pr_perror("Failed to make mount %d slave", first->mnt_id); - close(first_fd); + if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { + pr_perror("Failed to make mount %d slave", target->mnt_id); + close(target_fd); return -1; } } - /* Restore first's shared_id */ + /* Restore target's shared_id */ if (sg->shared_id) { - if (mount(NULL, first_path, NULL, MS_SHARED, NULL)) { - pr_perror("Failed to make mount %d shared", first->mnt_id); - close(first_fd); + if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { + pr_perror("Failed to make mount %d shared", target->mnt_id); + close(target_fd); return -1; } } - close(first_fd); + close(target_fd); + + return 0; +} + +static int restore_one_sharing_group(struct sharing_group *sg) +{ + struct mount_info *first, *other; + + first = get_first_mount(sg); + + if (restore_one_sharing(sg, first)) + return -1; /* Restore sharing for other mounts from the sharing group */ list_for_each_entry(other, &sg->mnt_list, mnt_sharing) { if (other == first) continue; - if (move_mount_set_group(first->mnt_fd_id, NULL, other->mnt_fd_id)) { - pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, other->mnt_id); - return -1; + if (is_sub_path(other->root, first->root)) { + if (move_mount_set_group(first->mnt_fd_id, NULL, other->mnt_fd_id)) { + pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, other->mnt_id); + return -1; + } + } else { + /* + * Case where 
mounts of this sharing group don't have common root. + * For instance we can create two sub-directories .a and .b in some + * shared mount, bindmount them separately somethere and umount the + * original mount. Now we have both bindmounts shared between each + * other. Kernel only allows to copy sharing between mounts when + * source root contains destination root, which is not true for + * these two, so we can't just copy from first to other. + * + * For external sharing (!sg->parent) with only master_id (shared_id + * == 0) we can workaround this by copying from their external source + * instead (same as we did for a first mount). + * + * This is a w/a runc usecase, see https://github.com/opencontainers/runc/pull/3442 + */ + if (!sg->parent && !sg->shared_id) { + if (restore_one_sharing(sg, other)) + return -1; + } else { + pr_err("Can't copy sharing from %d[%s] to %d[%s]\n", first->mnt_id, first->root, + other->mnt_id, other->root); + return -1; + } } } diff --git a/criu/namespaces.c b/criu/namespaces.c index 7356fe8c2f..0dc19d5b60 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -28,6 +27,7 @@ #include "cgroup.h" #include "fdstore.h" #include "kerndat.h" +#include "util-caps.h" #include "protobuf.h" #include "util.h" @@ -1217,20 +1217,9 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) return 0; } -struct unsc_msg { - struct msghdr h; - /* - * 0th is the call address - * 1st is the flags - * 2nd is the optional (NULL in response) arguments - */ - struct iovec iov[3]; - char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; -}; - static int usernsd_pid; -static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd) +inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid) { struct cmsghdr *ch; struct ucred *ucred; @@ -1268,7 
+1257,10 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void ch->cmsg_type = SCM_CREDENTIALS; ucred = (struct ucred *)CMSG_DATA(ch); - ucred->pid = getpid(); + if (pid) + ucred->pid = *pid; + else + ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); @@ -1283,7 +1275,7 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void } } -static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) +void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; @@ -1321,7 +1313,7 @@ static int usernsd(int sk) int flags, fd, ret; pid_t pid; - unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0); + unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL); if (recvmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: recv req error"); return -1; @@ -1366,7 +1358,7 @@ static int usernsd(int sk) else fd = -1; - unsc_msg_init(&um, &call, &ret, NULL, 0, fd); + unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL); if (sendmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: send resp error"); return -1; @@ -1417,7 +1409,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Send the request */ - unsc_msg_init(&um, &call, &flags, arg, arg_size, fd); + unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL); ret = sendmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: send req error"); @@ -1432,7 +1424,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Get the response back */ - unsc_msg_init(&um, &call, &res, NULL, 0, 0); + unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL); ret = recvmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: recv resp error"); @@ -1453,14 +1445,11 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, return ret; } -static int start_usernsd(void) +int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) { int sk[2]; int one = 1; - if 
(!(root_ns_mask & CLONE_NEWUSER)) - return 0; - /* * Seqpacket to * @@ -1489,24 +1478,39 @@ static int start_usernsd(void) return -1; } - usernsd_pid = fork(); - if (usernsd_pid < 0) { - pr_perror("Can't fork usernsd"); + *pid = fork(); + if (*pid < 0) { + pr_perror("Can't unix daemon"); close(sk[0]); close(sk[1]); return -1; } - if (usernsd_pid == 0) { + if (*pid == 0) { int ret; - close(sk[0]); - ret = usernsd(sk[1]); + ret = daemon_func(sk[1]); exit(ret); } - close(sk[1]); - if (install_service_fd(USERNSD_SK, sk[0]) < 0) { + + return sk[0]; +} + +static int start_usernsd(void) +{ + int sk; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + sk = start_unix_cred_daemon(&usernsd_pid, usernsd); + if (sk < 0) { + pr_err("failed to start usernsd\n"); + return -1; + } + + if (install_service_fd(USERNSD_SK, sk) < 0) { kill(usernsd_pid, SIGKILL); waitpid(usernsd_pid, NULL, 0); return -1; @@ -1623,10 +1627,12 @@ int collect_namespaces(bool for_dump) int prepare_userns_creds(void) { - /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ - if (setuid(0) || setgid(0) || setgroups(0, NULL)) { - pr_perror("Unable to initialize id-s"); - return -1; + if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) { + /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + pr_perror("Unable to initialize id-s"); + return -1; + } } /* diff --git a/criu/net.c b/criu/net.c index 2eff519c50..f29a166f8e 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3366,7 +3366,7 @@ int collect_net_namespaces(bool for_dump) struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net"); -struct ns_id *net_get_root_ns() +struct ns_id *net_get_root_ns(void) { static struct ns_id *root_netns = NULL; diff --git a/criu/page-pipe.c b/criu/page-pipe.c index 5a7e50bc19..54dc3ccc41 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -56,7 +56,7 @@ static inline int ppb_resize_pipe(struct page_pipe_buf *ppb) if (new_size 
> PIPE_MAX_SIZE) { if (ppb->pipe_size < PIPE_MAX_SIZE) - ppb->pipe_size = PIPE_MAX_SIZE; + new_size = PIPE_MAX_SIZE; else return 1; } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 2543a462a9..782d4cafce 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -617,31 +617,18 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p */ unsigned long handle_faulty_iov(int pid, struct iovec *riov, unsigned long faulty_index, struct iovec *bufvec, - struct iovec *aux_iov, unsigned long *aux_len, unsigned long partial_read_bytes) + struct iovec *aux_iov, unsigned long *aux_len) { struct iovec dummy; ssize_t bytes_read; - unsigned long offset = 0; unsigned long final_read_cnt = 0; - /* Handling Case 2*/ - if (riov[faulty_index].iov_len == PAGE_SIZE) { - cnt_sub(CNT_PAGES_WRITTEN, 1); - return 0; - } - /* Handling Case 3-Part 3.2*/ - offset = (partial_read_bytes) ? partial_read_bytes : PAGE_SIZE; - - dummy.iov_base = riov[faulty_index].iov_base + offset; - dummy.iov_len = riov[faulty_index].iov_len - offset; - - if (!partial_read_bytes) - cnt_sub(CNT_PAGES_WRITTEN, 1); + dummy.iov_base = riov[faulty_index].iov_base; + dummy.iov_len = riov[faulty_index].iov_len; while (dummy.iov_len) { bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0); - if (bytes_read == -1) { /* Handling faulty page read in faulty iov */ cnt_sub(CNT_PAGES_WRITTEN, 1); @@ -671,14 +658,12 @@ unsigned long handle_faulty_iov(int pid, struct iovec *riov, unsigned long fault /* * This function will position start pointer to the latest - * successfully read iov in iovec. In case of partial read it - * returns partial_read_bytes, otherwise 0. + * successfully read iov in iovec. 
*/ static unsigned long analyze_iov(ssize_t bytes_read, struct iovec *riov, unsigned long *index, struct iovec *aux_iov, unsigned long *aux_len) { ssize_t processed_bytes = 0; - unsigned long partial_read_bytes = 0; /* correlating iovs with read bytes */ while (processed_bytes < bytes_read) { @@ -692,13 +677,17 @@ static unsigned long analyze_iov(ssize_t bytes_read, struct iovec *riov, unsigne /* handling partially processed faulty iov*/ if (processed_bytes - bytes_read) { + unsigned long partial_read_bytes = 0; + (*index) -= 1; partial_read_bytes = riov[*index].iov_len - (processed_bytes - bytes_read); aux_iov[*aux_len - 1].iov_len = partial_read_bytes; + riov[*index].iov_base += partial_read_bytes; + riov[*index].iov_len -= partial_read_bytes; } - return partial_read_bytes; + return 0; } /* @@ -723,40 +712,36 @@ static long fill_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufve ssize_t bytes_read; unsigned long total_read = 0; unsigned long start = 0; - unsigned long partial_read_bytes = 0; while (start < ppb->nr_segs) { bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start], ppb->nr_segs - start, 0); - if (bytes_read == -1) { + if (errno == ESRCH) { + pr_debug("Target process PID:%d not found\n", pid); + return -ESRCH; + } + if (errno != EFAULT) { + pr_perror("process_vm_readv failed"); + return -1; + } /* Handling Case 1*/ if (riov[start].iov_len == PAGE_SIZE) { cnt_sub(CNT_PAGES_WRITTEN, 1); start += 1; continue; - } else if (errno == ESRCH) { - pr_debug("Target process PID:%d not found\n", pid); - return ESRCH; } + total_read += handle_faulty_iov(pid, riov, start, bufvec, aux_iov, aux_len); + start += 1; + continue; } - partial_read_bytes = 0; - if (bytes_read > 0) { - partial_read_bytes = analyze_iov(bytes_read, riov, &start, aux_iov, aux_len); + if (analyze_iov(bytes_read, riov, &start, aux_iov, aux_len) < 0) + return -1; bufvec->iov_base += bytes_read; bufvec->iov_len -= bytes_read; total_read += bytes_read; } - - /* - * If all iovs not 
processed in one go, - * it means some iov in between has failed. - */ - if (start < ppb->nr_segs) - total_read += handle_faulty_iov(pid, riov, start, bufvec, aux_iov, aux_len, partial_read_bytes); - - start += 1; } return total_read; @@ -777,40 +762,62 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p struct page_pipe_buf *ppb; unsigned int cur_hole = 0, i; unsigned long ret, bytes_read; + unsigned long userbuf_len; struct iovec bufvec; - struct iovec aux_iov[PIPE_MAX_SIZE]; + struct iovec *aux_iov; unsigned long aux_len; + void *userbuf; - char *userbuf = mmap(NULL, BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - + userbuf_len = PIPE_MAX_BUFFER_SIZE; + userbuf = mmap(NULL, userbuf_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (userbuf == MAP_FAILED) { pr_perror("Unable to mmap a buffer"); return -1; } + aux_iov = xmalloc(userbuf_len / PAGE_SIZE * sizeof(aux_iov[0])); + if (!aux_iov) + goto err; list_for_each_entry(ppb, &pp->bufs, l) { + if (ppb->pipe_size * PAGE_SIZE > userbuf_len) { + void *addr; + + addr = mremap(userbuf, userbuf_len, ppb->pipe_size * PAGE_SIZE, MREMAP_MAYMOVE); + if (addr == MAP_FAILED) { + pr_perror("Unable to mmap a buffer"); + goto err; + } + userbuf_len = ppb->pipe_size * PAGE_SIZE; + userbuf = addr; + addr = xrealloc(aux_iov, ppb->pipe_size * sizeof(aux_iov[0])); + if (!addr) + goto err; + aux_iov = addr; + } timing_start(TIME_MEMDUMP); aux_len = 0; - bufvec.iov_len = BUFFER_SIZE; + bufvec.iov_len = userbuf_len; bufvec.iov_base = userbuf; bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len); - - if (bytes_read == ESRCH) { - munmap(userbuf, BUFFER_SIZE); - return -1; + if (bytes_read == -ESRCH) { + timing_stop(TIME_MEMDUMP); + munmap(userbuf, userbuf_len); + xfree(aux_iov); + return 0; } + if (bytes_read < 0) + goto err; bufvec.iov_base = userbuf; bufvec.iov_len = bytes_read; - ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK); + ret = 
vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK | SPLICE_F_GIFT); if (ret == -1 || ret != bytes_read) { pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret); - munmap(userbuf, BUFFER_SIZE); - return -1; + goto err; } timing_stop(TIME_MEMDUMP); @@ -822,10 +829,8 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p u32 flags; ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base); - if (ret) { - munmap(userbuf, BUFFER_SIZE); - return ret; - } + if (ret) + goto err; BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; @@ -833,24 +838,25 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p flags = ppb_xfer_flags(xfer, ppb); - if (xfer->write_pagemap(xfer, &iov, flags)) { - munmap(userbuf, BUFFER_SIZE); - return -1; - } + if (xfer->write_pagemap(xfer, &iov, flags)) + goto err; - if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) { - munmap(userbuf, BUFFER_SIZE); - return -1; - } + if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) + goto err; } timing_stop(TIME_MEMWRITE); } - munmap(userbuf, BUFFER_SIZE); + munmap(userbuf, userbuf_len); + xfree(aux_iov); timing_start(TIME_MEMWRITE); return dump_holes(xfer, pp, &cur_hole, NULL); +err: + munmap(userbuf, userbuf_len); + xfree(aux_iov); + return -1; } int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index ee4fa86f4f..d3541d9969 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -513,6 +513,7 @@ int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_a struct parasite_dump_cgroup_args *ca; ca = compel_parasite_args(ctl, struct parasite_dump_cgroup_args); + memcpy(ca->thread_cgrp, cgroup->thread_cgrp, sizeof(ca->thread_cgrp)); ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_CGROUP, ctl); if (ret) { pr_err("Parasite failed to dump /proc/self/cgroup\n"); diff --git a/criu/pidfd-store.c b/criu/pidfd-store.c 
index b15568e08e..9fdc74cb74 100644 --- a/criu/pidfd-store.c +++ b/criu/pidfd-store.c @@ -13,6 +13,7 @@ #include "log.h" #include "util.h" #include "pidfd-store.h" +#include "sockets.h" struct pidfd_entry { pid_t pid; @@ -94,9 +95,7 @@ int init_pidfd_store_sk(pid_t pid, int sk) * This is similar to how fdstore_init() works. */ if (addrlen == sizeof(sa_family_t)) { - if (setsockopt(pidfd_store_sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(pidfd_store_sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(pidfd_store_sk, buf)) { goto err; } diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index e7eb1fcb60..2303f41c39 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -14,6 +13,7 @@ #include "int.h" #include "types.h" #include +#include "linux/mount.h" #include "parasite.h" #include "fcntl.h" #include "prctl.h" @@ -745,7 +745,7 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) return -1; } - cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0); + cgroup = sys_openat(proc, args->thread_cgrp, O_RDONLY, 0); sys_close(proc); if (cgroup < 0) { pr_err("can't get /proc/self/cgroup fd\n"); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index f80b68359b..99cff1f7d0 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "linux/userfaultfd.h" @@ -184,7 +185,7 @@ static int lsm_set_label(char *label, char *type, int procfd) return 0; } -static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type) +static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type, uid_t uid) { CredsEntry *ce = &args->creds; int b, i, ret; @@ -211,10 +212,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * lose 
caps bits when changing xids. */ - ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); - if (ret) { - pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); + if (ret) { + pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); + return -1; + } } /* @@ -252,10 +255,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * special state any longer. */ - ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); - if (ret) { - pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); + return -1; + } } /* @@ -582,6 +587,103 @@ static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sig ARCH_RT_SIGRETURN(new_sp, sigframe); } +static int send_cg_set(int sk, int cg_set) +{ + struct cmsghdr *ch; + struct msghdr h; + /* + * 0th is the dummy call address for compatibility with userns helper + * 1st is the cg_set + */ + struct iovec iov[2]; + char cmsg[CMSG_SPACE(sizeof(struct ucred))] = {}; + int ret, *dummy = NULL; + struct ucred *ucred; + + iov[0].iov_base = &dummy; + iov[0].iov_len = sizeof(dummy); + iov[1].iov_base = &cg_set; + iov[1].iov_len = sizeof(cg_set); + + h.msg_iov = iov; + h.msg_iovlen = sizeof(iov) / sizeof(struct iovec); + h.msg_name = NULL; + h.msg_namelen = 0; + h.msg_flags = 0; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_CREDENTIALS; + + ucred = (struct ucred *)CMSG_DATA(ch); + /* + * We still have privilege in this namespace so we can send + * thread id instead of pid of main thread, uid, gid as 0 + * since these 2 are ignored in cgroupd + */ + ucred->pid = sys_gettid(); + 
ucred->uid = 0; + ucred->gid = 0; + + ret = sys_sendmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to send packet to cgroupd %d\n", ret); + return -1; + } + + return 0; +} + +/* + * As this socket is shared among threads, recvmsg(MSG_PEEK) + * from the socket until getting its own thread id as an + * acknowledge of successful threaded cgroup fixup + */ +static int recv_cg_set_restore_ack(int sk) +{ + struct cmsghdr *ch; + struct msghdr h = {}; + char cmsg[CMSG_SPACE(sizeof(struct ucred))]; + struct ucred *cred; + int ret; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + + while (1) { + ret = sys_recvmsg(sk, &h, MSG_PEEK); + if (ret < 0) { + pr_err("Unable to peek from cgroupd %d\n", ret); + return -1; + } + + if (h.msg_controllen != sizeof(cmsg)) { + pr_err("The message from cgroupd is truncated\n"); + return -1; + } + + ch = CMSG_FIRSTHDR(&h); + cred = (struct ucred *)CMSG_DATA(ch); + if (cred->pid != sys_gettid()) + continue; + + /* + * Actual remove message from recv queue of socket + */ + ret = sys_recvmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to receive from cgroupd %d\n", ret); + return -1; + } + + break; + } + return 0; +} + /* * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. 
@@ -609,6 +711,15 @@ long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; + if (args->cg_set != -1) { + pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); + if (send_cg_set(args->cgroupd_sk, args->cg_set)) + goto core_restore_end; + if (recv_cg_set_restore_ack(args->cgroupd_sk)) + goto core_restore_end; + sys_close(args->cgroupd_sk); + } + if (restore_thread_common(args)) goto core_restore_end; @@ -634,7 +745,7 @@ long __export_restore_thread(struct thread_restore_args *args) if (restore_seccomp(args)) BUG(); - ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type); + ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type, args->ta->uid); ret = ret || restore_dumpable_flag(&args->ta->mm); ret = ret || restore_pdeath_sig(args); if (ret) @@ -1915,7 +2026,7 @@ long __export_restore_task(struct task_restore_args *args) * turning off TCP repair is CAP_SYS_NED_ADMIN protected, * thus restore* creds _after_ all of the above. */ - ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type); + ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type, args->uid); ret = ret || restore_dumpable_flag(&args->mm); ret = ret || restore_pdeath_sig(args->t); ret = ret || restore_child_subreaper(args->child_subreaper); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index b3badb6e41..abac5908b7 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -620,17 +620,16 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat pr_info("path: %s\n", file_path); vma_area->e->status |= VMA_AREA_SYSVIPC; } else { - /* Dump shmem dev, hugetlb dev (private and share) mappings the same way as memfd - * when possible. + /* We dump memfd backed mapping, both normal and hugepage anonymous share + * mapping using memfd approach when possible. 
*/ if (is_memfd(st_buf->st_dev) || is_anon_shmem_map(st_buf->st_dev) || - (kdat.has_memfd_hugetlb && is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag))) { + can_dump_with_memfd_hugetlb(st_buf->st_dev, &hugetlb_flag, file_path, vma_area)) { vma_area->e->status |= VMA_AREA_MEMFD; vma_area->e->flags |= hugetlb_flag; if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; } else if (is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag)) { - /* hugetlb mapping but memfd does not support HUGETLB */ vma_area->e->flags |= hugetlb_flag; vma_area->e->flags |= MAP_ANONYMOUS; @@ -1028,12 +1027,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) cr->s.sigpnd = 0; cr->s.shdpnd = 0; + cr->s.sigblk = 0; cr->s.seccomp_mode = SECCOMP_MODE_DISABLED; if (bfdopenr(&f)) return -1; - while (done < 13) { + while (done < 14) { str = breadline(&f); if (str == NULL) break; @@ -1144,13 +1144,23 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) goto err_parse; cr->s.sigpnd |= sigpnd; + done++; + continue; + } + if (!strncmp(str, "SigBlk:", 7)) { + unsigned long long sigblk = 0; + + if (sscanf(str + 7, "%llx", &sigblk) != 1) + goto err_parse; + cr->s.sigblk |= sigblk; + done++; continue; } } /* seccomp and nspids are optional */ - expected_done = (parsed_seccomp ? 11 : 10); + expected_done = (parsed_seccomp ? 
12 : 11); if (kdat.has_nspid) expected_done++; if (done == expected_done) @@ -2539,7 +2549,8 @@ int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n) return -1; } -int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n) +int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *retl, + unsigned int *n) { FILE *f; int ret; @@ -2547,7 +2558,7 @@ int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct li unsigned int n_internal = 0; struct cg_ctl *intern, *ext; - f = fopen_proc(pid, "cgroup"); + f = fopen_proc(pid, "task/%d/cgroup", tid); if (!f) return -1; diff --git a/criu/pstree.c b/criu/pstree.c index f4d77b3a49..72c4a3502a 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -222,6 +222,7 @@ struct pstree_item *__alloc_pstree_item(bool rst) item->pid->ns[0].virt = -1; item->pid->real = -1; item->pid->state = TASK_UNDEF; + item->pid->stop_signo = -1; item->born_sid = -1; item->pid->item = item; futex_init(&item->task_st); diff --git a/criu/seize.c b/criu/seize.c index 58564ca746..f2af12a0bd 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -535,8 +535,10 @@ static int freeze_processes(void) } err: - if (exit_code == 0 || origin_freezer_state == THAWED) - exit_code = freezer_write_state(fd, THAWED); + if (exit_code == 0 || origin_freezer_state == THAWED) { + if (freezer_write_state(fd, THAWED)) + exit_code = -1; + } if (close(fd)) { pr_perror("Unable to thaw tasks"); @@ -615,6 +617,9 @@ static int collect_children(struct pstree_item *item) else processes_to_wait--; + if (ret == TASK_STOPPED) + c->pid->stop_signo = compel_parse_stop_signo(pid); + c->pid->real = pid; c->parent = item; c->pid->state = ret; @@ -646,7 +651,7 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) * the item->state is the state task was in when we seized one. 
*/ - compel_resume_task(item->pid->real, item->pid->state, st); + compel_resume_task_sig(item->pid->real, item->pid->state, st, item->pid->stop_signo); if (st == TASK_DEAD) return; @@ -950,6 +955,9 @@ int collect_pstree(void) else processes_to_wait--; + if (ret == TASK_STOPPED) + root_item->pid->stop_signo = compel_parse_stop_signo(pid); + pr_info("Seized task %d, state %d\n", pid, ret); root_item->pid->state = ret; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index c6021bc1f5..873360bfad 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -497,9 +497,34 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) goto err; } + if (sk->wqlen != 0) { + /* + * There's no known way to get data out of the write + * queue of an icon socket. The only good solution for + * now is to fail the migration. + */ + pr_err("Non-empty write queue on an in-flight socket %#x\n", ue->ino); + goto err; + } + ue->peer = e->sk_desc->sd.ino; pr_debug("\t\tFixed inflight socket %u peer %u)\n", ue->ino, ue->peer); + } else if (ue->state == TCP_LISTEN) { + int i; + + for (i = 0; i < sk->nr_icons; i++) + if (sk->icons[i] == 0) { + /* + * Inode of an icon socket equal to 0 means + * it's already been closed. That means we have + * no simple way to check if it sent any data. + * The only good solution for now is to fail + * the migration. 
+ */ + pr_err("Found a closed in-flight socket to %#x\n", ue->ino); + goto err; + } } dump: if (dump_socket_opts(lfd, skopts)) @@ -1021,8 +1046,8 @@ static struct unix_sk_info *find_queuer_for(int id) struct unix_sk_info *ui; list_for_each_entry(ui, &unix_sockets, list) { - if (ui->queuer && ui->queuer->ue->id == id) - return ui; + if (ui->queuer && ui->ue->id == id) + return ui->queuer; } return NULL; diff --git a/criu/sockets.c b/criu/sockets.c index db772707b6..d17e0a9869 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -29,6 +29,7 @@ #include "pstree.h" #include "util.h" #include "fdstore.h" +#include "cr_options.h" #undef LOG_PREFIX #define LOG_PREFIX "sockets: " @@ -465,18 +466,33 @@ int do_restore_opt(int sk, int level, int name, void *val, int len) return 0; } -static int sk_setbufs(void *arg, int fd, pid_t pid) +int sk_setbufs(int sk, uint32_t *bufs) { - u32 *buf = (u32 *)arg; + uint32_t sndbuf = bufs[0], rcvbuf = bufs[1]; - if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0])) - return -1; - if (restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1])) - return -1; + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &sndbuf, sizeof(sndbuf)) || + setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &rcvbuf, sizeof(rcvbuf))) { + if (opts.unprivileged) { + pr_info("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE, falling back to SO_SNDBUF/SO_RCVBUF\n"); + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) || + setsockopt(sk, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf))) { + pr_perror("Unable to set socket SO_SNDBUF/SO_RCVBUF"); + return -1; + } + } else { + pr_perror("Unable to set socket SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + return -1; + } + } return 0; } +static int sk_setbufs_ns(void *arg, int fd, pid_t pid) +{ + return sk_setbufs(fd, (uint32_t *)arg); +} + /* * Set sizes of buffers to maximum and prevent blocking * Caller of this fn should call other socket restoring @@ -489,7 +505,7 @@ int restore_prepare_socket(int sk) /* In kernel a bufsize has type 
int and a value is doubled. */ u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 }; - if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk)) + if (userns_call(sk_setbufs_ns, 0, maxbuf, sizeof(maxbuf), sk)) return -1; /* Prevent blocking on restore */ @@ -517,7 +533,7 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); /* setsockopt() multiplies the input values by 2 */ - ret |= userns_call(sk_setbufs, 0, bufs, sizeof(bufs), sk); + ret |= userns_call(sk_setbufs_ns, 0, bufs, sizeof(bufs), sk); if (soe->has_so_buf_lock) { pr_debug("\trestore buf_lock %d for socket\n", soe->so_buf_lock); @@ -631,8 +647,12 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); soe->has_so_rcvlowat = true; ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); - soe->has_so_mark = true; + /* + * Restoring SO_MARK requires root or CAP_NET_ADMIN. Avoid saving it + * in unprivileged mode if still has its default value. + */ ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); + soe->has_so_mark = !!soe->so_mark; ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); soe->so_snd_tmo_sec = tv.tv_sec; diff --git a/criu/sysctl.c b/criu/sysctl.c index b06688712f..99026acf45 100644 --- a/criu/sysctl.c +++ b/criu/sysctl.c @@ -203,6 +203,17 @@ static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) * 2. forks a task * 3. setns()es to the UTS/IPC namespace of the caller * 4. write()s to the files and exits + * + * For the IPC namespace, since + * https://github.com/torvalds/linux/commit/5563cabdde, user with + * enough capability can open IPC sysctl files and write to it. Later + * commit https://github.com/torvalds/linux/commit/1f5c135ee5 and + * https://github.com/torvalds/linux/commit/0889f44e28 bind the IPC + * namespace at the open() time so the changed value does not depend + * on the IPC namespace at the write() time. 
Also, the permission check + * changes a little bit which makes the above approach unusable but we + * can simply use nonuserns version for restoring as IPC sysctl as the + * restored process currently has enough capability. */ dir = open("/proc/sys", O_RDONLY, O_DIRECTORY); if (dir < 0) { @@ -335,9 +346,12 @@ static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) return ret; } -static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) +/* exit_code = 1 in case nonuserns failed but we want to fallback to userns approach */ +static int __nonuserns_sysctl_op(struct sysctl_req **orig_req, size_t *orig_nr_req, int op) { int ret, exit_code = -1; + struct sysctl_req *req = *orig_req; + size_t nr_req = *orig_nr_req; while (nr_req--) { int fd; @@ -351,6 +365,14 @@ static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) req++; continue; } + if (errno == EACCES && (req->flags & CTL_FLAGS_IPC_EACCES_SKIP)) { + /* The remaining requests are restored using userns approach */ + *orig_req = req; + *orig_nr_req = nr_req + 1; + exit_code = 1; + goto out; + } + pr_perror("Can't open sysctl %s", req->name); goto out; } @@ -404,7 +426,16 @@ int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns) * so we can do those in process as well. */ if (!ns || ns & CLONE_NEWNET || op == CTL_READ) - return __nonuserns_sysctl_op(req, nr_req, op); + return __nonuserns_sysctl_op(&req, &nr_req, op); + + /* Try to use nonuserns for restoring IPC sysctl and fallback to + * userns approach when the returned code is 1. 
+ */ + if (ns & CLONE_NEWIPC && op == CTL_WRITE) { + ret = __nonuserns_sysctl_op(&req, &nr_req, op); + if (ret <= 0) + return ret; + } /* * In order to avoid lots of opening of /proc/sys for each struct sysctl_req, diff --git a/criu/timens.c b/criu/timens.c index 5803fc3594..66c0c02a42 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -5,6 +5,7 @@ #include "proc_parse.h" #include "namespaces.h" #include "timens.h" +#include "cr_options.h" #include "protobuf.h" #include "images/timens.pb-c.h" @@ -57,6 +58,9 @@ int prepare_timens(int id) struct timespec ts; struct timespec prev_moff = {}, prev_boff = {}; + if (opts.unprivileged) + return 0; + img = open_image(CR_FD_TIMENS, O_RSTR, id); if (!img) return -1; diff --git a/criu/unittest/mock.c b/criu/unittest/mock.c index 0151873dc1..e517720e42 100644 --- a/criu/unittest/mock.c +++ b/criu/unittest/mock.c @@ -103,8 +103,7 @@ void set_cr_errno(int new_err) { } -struct ns_desc { -}; +struct ns_desc {}; struct ns_desc user_ns_desc; int switch_ns(int pid, struct ns_desc *nd, int *rst) { @@ -118,8 +117,7 @@ int run_scripts(enum script_actions act) } typedef struct VmaEntry VmaEntry; -struct VmaEntry { -}; +struct VmaEntry {}; void vma_entry__init(VmaEntry *message) { } diff --git a/criu/util.c b/criu/util.c index 40b12bace8..959e609388 100644 --- a/criu/util.c +++ b/criu/util.c @@ -40,6 +40,8 @@ #include "mem.h" #include "namespaces.h" #include "criu-log.h" +#include "syscall.h" +#include "util-caps.h" #include "clone-noasan.h" #include "cr_options.h" @@ -1425,6 +1427,9 @@ void rlimit_unlimit_nofile(void) { struct rlimit new; + if (opts.unprivileged && !has_cap_sys_resource(opts.cap_eff)) + return; + new.rlim_cur = kdat.sysctl_nr_open; new.rlim_max = kdat.sysctl_nr_open; @@ -1871,7 +1876,7 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) uint64_t criu_run_id; -void util_init() +void util_init(void) { struct timespec tp; @@ -2021,6 +2026,10 @@ char *resolve_mountpoint(char *path) char 
*mp_path, *free_path; bool is_mountpoint; + /* + * The dirname() function may modify the contents of given path, + * so we need a strdup here to preserve path. + */ mp_path = free_path = xstrdup(path); if (!mp_path) return NULL; @@ -2031,7 +2040,7 @@ char *resolve_mountpoint(char *path) * by openat2 RESOLVE_NO_XDEV, let's just assume they are. */ if (is_same_path(mp_path, "/")) - return mp_path; + goto out; if (path_is_mountpoint(mp_path, &is_mountpoint) == -1) { xfree(free_path); @@ -2039,7 +2048,7 @@ char *resolve_mountpoint(char *path) } if (is_mountpoint) - return mp_path; + goto out; /* Try parent directory */ mp_path = dirname(mp_path); @@ -2048,4 +2057,32 @@ char *resolve_mountpoint(char *path) /* never get here */ xfree(free_path); return NULL; +out: + /* + * The dirname() function may or may not return statically allocated + * strings, so here mp_path can be either dynamically allocated or + * statically allocated. Let's strdup to make the return pointer + * always freeable. + */ + mp_path = xstrdup(mp_path); + xfree(free_path); + return mp_path; +} + +int set_opts_cap_eff(void) +{ + struct __user_cap_header_struct cap_header; + struct __user_cap_data_struct cap_data[_LINUX_CAPABILITY_U32S_3]; + int i; + + cap_header.version = _LINUX_CAPABILITY_VERSION_3; + cap_header.pid = getpid(); + + if (capget(&cap_header, &cap_data[0])) + return -1; + + for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++) + memcpy(&opts.cap_eff[i], &cap_data[i].effective, sizeof(u32)); + + return 0; } diff --git a/criu/vdso.c b/criu/vdso.c index 1a51f1451d..7de2fae784 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -479,7 +479,7 @@ static int vdso_mmap_compat(struct vdso_maps *native, struct vdso_maps *compat, return ret; } -#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 2) +#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 4) static int vdso_fill_compat_symtable(struct vdso_maps *native, struct vdso_maps *compat) { void *vdso_mmap; diff --git a/flog/Makefile b/flog/Makefile new file mode 100644 index 
0000000000..12255af719 --- /dev/null +++ b/flog/Makefile @@ -0,0 +1,29 @@ +OPTS=-ggdb3 -Wall -Werror +export OPTS + +CFLAGS += -iquote include +CFLAGS += -iquote flog/include +CFLAGS += -iquote flog/include/uapi + +include $(__nmk_dir)msg.mk + +$(eval $(call gen-built-in,src)) + +flog: + $(Q) $(MAKE) $(build)=$(obj)/src all +.PHONY: flog + +clean-flog: + $(call msg-gen, $@) + $(Q) $(MAKE) $(build)=$(obj)/src clean + $(Q) $(RM) built-in.o +.PHONY: clean-flog + +clean: clean-flog +mrproper: clean + +test: + ./tests/test00 + +all-y += flog + diff --git a/flog/built-in.S b/flog/built-in.S new file mode 100644 index 0000000000..26627d0544 --- /dev/null +++ b/flog/built-in.S @@ -0,0 +1,4 @@ +SECTIONS +{ + .rodata : { _rodata_start = . ; *(.rodata*) ; _rodata_end = . ;} +} diff --git a/flog/include/compiler.h b/flog/include/compiler.h new file mode 100644 index 0000000000..80264ec631 --- /dev/null +++ b/flog/include/compiler.h @@ -0,0 +1,77 @@ +#ifndef __COMPILER_H__ +#define __COMPILER_H__ + +/* + * Various definitions for success build, + * picked from various places, mostly from + * the linux kernel. + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) + +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +#define __section(S) __attribute__((__section__(#S))) + +#ifndef __always_inline +#define __always_inline inline __attribute__((always_inline)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef always_inline +#define always_inline __always_inline +#endif + +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + +#define __aligned(x) __attribute__((aligned(x))) + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER) +#endif + +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) + +#define min(x, y) \ + ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void)(&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; \ + }) + +#define max(x, y) \ + ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void)(&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; \ + }) + +#define is_log2(v) (((v) & ((v)-1)) == 0) + +#endif /* __COMPILER_H__ */ diff --git a/flog/include/flog.h b/flog/include/flog.h new file mode 100644 index 0000000000..f00c20541f --- /dev/null +++ b/flog/include/flog.h @@ -0,0 +1,9 @@ +#ifndef __FLOG_H__ +#define __FLOG_H__ + +#include +#include + +#include "uapi/flog.h" + +#endif /* __FLOG_H__ */ diff --git a/flog/include/log.h b/flog/include/log.h new file mode 100644 index 0000000000..8aafe44b75 --- /dev/null +++ b/flog/include/log.h @@ -0,0 +1,17 @@ +#ifndef __LOG_H__ +#define __LOG_H__ + +#include + +#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) + +#if 1 +#define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#else +#define pr_debug(fmt, ...) +#endif + +#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_perror(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) + +#endif /* __LOG_H__ */ diff --git a/flog/include/types.h b/flog/include/types.h new file mode 100644 index 0000000000..07c992968b --- /dev/null +++ b/flog/include/types.h @@ -0,0 +1,16 @@ +#ifndef __FLOG_TYPES_H__ +#define __FLOG_TYPES_H__ + +#include +#include + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +#endif /* __FLOG_TYPES_H__ */ diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h new file mode 100644 index 0000000000..6061f4556a --- /dev/null +++ b/flog/include/uapi/flog.h @@ -0,0 +1,139 @@ +#ifndef __UAPI_FLOG_H__ +#define __UAPI_FLOG_H__ + +#include +#include +#include + +/* + * We work with up to 32 arguments in macros here. + * If more provided -- behaviour is undefined. + */ + +/* + * By Laurent Deniau at https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s + */ +#define FLOG_PP_NARG_(...) 
FLOG_PP_ARG_N(__VA_ARGS__) +#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) + +#define FLOG_PP_ARG_N(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, \ + _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, N, ...) \ + N + +#define FLOG_PP_RSEQ_N() \ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, \ + 2, 1, 0 + +#define FLOG_GENMASK_0(N, x) 0 +#define FLOG_GENMASK_1(N, op, x, ...) (op(N, 0, x)) +#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_11(N, op, x, ...) ((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_17(N, op, x, ...) 
((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_22(N, op, x, ...) ((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_32(N, op, x, ...) ((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) + +#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) +#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) +#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 + +#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) +#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) + +#define flog_genbit(ord, n, v, ...) 
\ + _Generic((v), \ + \ + /* Basic types */ \ + char: 0, \ + signed char: 0, \ + unsigned char: 0, \ + signed short int: 0, \ + unsigned short int: 0, \ + signed int: 0, \ + unsigned int: 0, \ + signed long: 0, \ + unsigned long: 0, \ + signed long long: 0, \ + unsigned long long: 0, \ + \ + /* Not used for a while */ \ + /* float: 12, */ \ + /* double: 13, */ \ + /* long double: 14, */ \ + \ + /* Basic pointers */ \ + char *: (1u << (ord - n - 1)), \ + signed char *: (1u << (ord - n - 1)), \ + unsigned char *: (1u << (ord - n - 1)), \ + signed short int *: 0, \ + unsigned short int *: 0, \ + signed int *: 0, \ + unsigned int *: 0, \ + signed long *: 0, \ + unsigned long *: 0, \ + signed long long *: 0, \ + unsigned long long *: 0, \ + void *: 0, \ + \ + /* Const basic pointers */ \ + const char *: (1u << (ord - n - 1)), \ + const signed char *: (1u << (ord - n - 1)), \ + const unsigned char *: (1u << (ord - n - 1)), \ + const signed short int *: 0, \ + const unsigned short int *: 0, \ + const signed int *: 0, \ + const unsigned int *: 0, \ + const signed long *: 0, \ + const unsigned long *: 0, \ + const signed long long *: 0, \ + const unsigned long long *: 0, \ + const void *: 0, \ + \ + /* Systypes and pointers */ \ + default: -1) + +typedef struct { + unsigned int magic; + unsigned int size; + unsigned int nargs; + unsigned int mask; + long fmt; + long args[0]; +} flog_msg_t; + +extern int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...); +void flog_decode_msg(int fdout, const char *format, ...); +extern int flog_decode_all(int fdin, int fdout); + +#define flog_encode(fdout, fmt, ...) 
\ + flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) + +int flog_map_buf(int fdout); +int flog_close(int fdout); + +#endif /* __UAPI_FLOG_H__ */ diff --git a/flog/include/util.h b/flog/include/util.h new file mode 100644 index 0000000000..7b1edb6885 --- /dev/null +++ b/flog/include/util.h @@ -0,0 +1,41 @@ +#ifndef __UTIL_H__ +#define __UTIL_H__ + +#include +#include + +#include "log.h" +#include "types.h" + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) \ + do { \ + if (p) \ + free(p); \ + } while (0) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -ENOMEM; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +#endif /* __UTIL_H__ */ diff --git a/flog/src/Makefile b/flog/src/Makefile new file mode 100644 index 0000000000..ee73ea7252 --- /dev/null +++ b/flog/src/Makefile @@ -0,0 +1,5 @@ +ccflags-y += -DCONFIG_X86_64 -iquote ./include $(OPTS) +ldflags-y += -r + +#obj-y += main.o +obj-y += flog.o diff --git a/flog/src/flog.c b/flog/src/flog.c new file mode 100644 index 0000000000..d7660f18d8 --- /dev/null +++ b/flog/src/flog.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include + +//#include + +#include "uapi/flog.h" +#include "util.h" + +#define MAGIC 0xABCDABCD + +#define BUF_SIZE (1 << 20) +static char _mbuf[BUF_SIZE]; +static char *mbuf = _mbuf; +static char *fbuf; +static uint64_t fsize; +static uint64_t mbuf_size = sizeof(_mbuf); + +/*int flog_decode_all(int fdin, int fdout) +{ + flog_msg_t *m = (void *)mbuf; + 
ffi_type *args[34] = { + [0] = &ffi_type_sint, + [1] = &ffi_type_pointer, + [2 ... 33] = &ffi_type_slong + }; + void *values[34]; + ffi_cif cif; + ffi_arg rc; + size_t i, ret; + char *fmt; + + values[0] = (void *)&fdout; + + while (1) { + ret = read(fdin, mbuf, sizeof(m)); + if (ret == 0) + break; + if (ret < 0) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + if (m->magic != MAGIC) { + fprintf(stderr, "The log file was not properly closed\n"); + break; + } + ret = m->size - sizeof(m); + if (m->size > mbuf_size) { + fprintf(stderr, "The buffer is too small"); + return -1; + } + if (read(fdin, mbuf + sizeof(m), ret) != ret) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + + fmt = mbuf + m->fmt; + values[1] = &fmt; + + for (i = 0; i < m->nargs; i++) { + values[i + 2] = (void *)&m->args[i]; + if (m->mask & (1u << i)) { + m->args[i] = (long)(mbuf + m->args[i]); + } + } + + if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, m->nargs + 2, + &ffi_type_sint, args) == FFI_OK) + ffi_call(&cif, FFI_FN(dprintf), &rc, values); + } + return 0; +}*/ + +static int flog_enqueue(flog_msg_t *m) +{ + if (write(1, m, m->size) != m->size) { + fprintf(stderr, "Unable to write a message\n"); + return -1; + } + return 0; +} + +/*extern char *rodata_start; +extern char *rodata_end; +*/ +/* Pre-allocate a buffer in a file and map it into memory. */ +int flog_map_buf(int fdout) +{ + uint64_t off = 0; + void *addr; + + /* + * Two buffers are mmapped into memory. A new one is mapped when a first + * one is completely filled. 
+ */ + if (fbuf && (mbuf - fbuf < BUF_SIZE)) + return 0; + + if (fbuf) { + if (munmap(fbuf, BUF_SIZE * 2)) { + fprintf(stderr, "Unable to unmap a buffer: %m"); + return -1; + } + off = mbuf - fbuf - BUF_SIZE; + fbuf = NULL; + } + + if (fsize == 0) + fsize += BUF_SIZE; + fsize += BUF_SIZE; + + if (ftruncate(fdout, fsize)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + + if (!fbuf) + addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, MAP_FILE | MAP_SHARED, fdout, + fsize - 2 * BUF_SIZE); + else + addr = mremap(fbuf + BUF_SIZE, BUF_SIZE, BUF_SIZE * 2, MREMAP_FIXED, fbuf); + if (addr == MAP_FAILED) { + fprintf(stderr, "Unable to map a buffer: %m"); + return -1; + } + + fbuf = addr; + mbuf = fbuf + off; + mbuf_size = 2 * BUF_SIZE; + + return 0; +} + +int flog_close(int fdout) +{ + if (mbuf == _mbuf) + return 0; + + munmap(fbuf, BUF_SIZE * 2); + + if (ftruncate(fdout, fsize - 2 * BUF_SIZE + mbuf - fbuf)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + return 0; +} + +int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...) +{ + flog_msg_t *m; + va_list argptr; + char *str_start, *p; + size_t i; + + if (mbuf != _mbuf && flog_map_buf(fdout)) + return -1; + + m = (void *)mbuf; + + m->nargs = nargs; + m->mask = mask; + + str_start = (void *)m->args + sizeof(m->args[0]) * nargs; + p = memccpy(str_start, format, 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + return -1; + } + m->fmt = str_start - mbuf; + str_start = p; + + va_start(argptr, format); + for (i = 0; i < nargs; i++) { + m->args[i] = (long)va_arg(argptr, long); + /* + * If we got a string, we should either + * reference it when in rodata, or make + * a copy (FIXME implement rodata refs). 
+ */ + if (mask & (1u << i)) { + p = memccpy(str_start, (void *)m->args[i], 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + va_end(argptr); + return -1; + } + m->args[i] = str_start - mbuf; + str_start = p; + } + } + va_end(argptr); + m->size = str_start - mbuf; + + /* + * A magic is required to know where we stop writing into a log file, + * if it was not properly closed. The file is mapped into memory, so a + * space in the file is allocated in advance and at the end it can have + * some unused tail. + */ + m->magic = MAGIC; + + m->size = roundup(m->size, 8); + if (mbuf == _mbuf) { + if (flog_enqueue(m)) + return -1; + } else { + mbuf += m->size; + mbuf_size -= m->size; + } + return 0; +} diff --git a/flog/src/main.c b/flog/src/main.c new file mode 100644 index 0000000000..e027917c68 --- /dev/null +++ b/flog/src/main.c @@ -0,0 +1,158 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "flog.h" + +extern char _rodata_start, _rodata_end; +char *rodata_start = &_rodata_start; +char *rodata_end = &_rodata_end; + +enum { + MODE_BINARY, + MODE_FPRINTF, + MODE_SPRINTF, + MODE_DPRINTF, +}; + +int main(int argc, char *argv[]) +{ + static const char str1[] = "String1 String1"; + static const char str2[] = "string2 string2 string2"; + int fdout = STDOUT_FILENO; + bool use_decoder = false; + int mode = MODE_BINARY; + size_t niter = 100; + int opt, idx; + size_t i; + + static const char short_opts[] = "m:o:di:h"; + static struct option long_opts[] = { + { "mode", required_argument, 0, 'm' }, { "output", required_argument, 0, 'o' }, + { "decode", no_argument, 0, 'd' }, { "iter", required_argument, 0, 'i' }, + { "help", no_argument, 0, 'h' }, {}, + }; + + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + + switch (opt) { + case 'm': + if (strcmp(optarg, "binary") == 0) { + mode = MODE_BINARY; + } else if 
(strcmp(optarg, "fprintf") == 0) { + mode = MODE_FPRINTF; + } else if (strcmp(optarg, "sprintf") == 0) { + mode = MODE_SPRINTF; + } else if (strcmp(optarg, "dprintf") == 0) { + mode = MODE_DPRINTF; + } else + goto usage; + break; + case 'o': + if (strcmp(optarg, "stdout") == 0) { + fdout = fileno(stdout); + } else if (strcmp(optarg, "stderr") == 0) { + fdout = fileno(stderr); + } else { + fdout = open(optarg, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fdout < 0) { + fprintf(stderr, "Can't open %s: %s\n", optarg, strerror(errno)); + exit(1); + } + } + break; + case 'i': + niter = atoi(optarg); + break; + case 'd': + use_decoder = true; + break; + case 'h': + default: + goto usage; + } + } + + switch (mode) { + case MODE_BINARY: + if (use_decoder) + return flog_decode_all(STDIN_FILENO, fdout); + + if (fdout != STDOUT_FILENO && flog_map_buf(fdout)) + return 1; + for (i = 0; i < niter; i++) + if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, + (short)2, (unsigned long)2)) + return 1; + if (flog_close(fdout)) + return 1; + break; + case MODE_DPRINTF: { + for (i = 0; i < niter; i++) { + dprintf(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + case MODE_FPRINTF: { + FILE *f = fdopen(fdout, "w"); + + for (i = 0; i < niter; i++) { + fprintf(f, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + fflush(f); + } + fclose(f); + break; + } + case MODE_SPRINTF: { + static char buf[4096]; + + for (i = 0; i < niter; i++) { + sprintf(buf, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + default: + return 1; + } + + return 0; +usage: + fprintf(stderr, "flog [--mode binary|fprintf|sprintf|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" + "\n" + + "Examples:\n" + "\n" + + " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" + 
"\n" + " flog -m dprintf -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode without processing (queue messages only)\n" + "\n" + " flog -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after\n" + "\n" + " flog -i 100000 -d\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after, writing results into 'out' file\n" + "\n" + " flog -i 100000 -d -o out\n" + "\n"); + return 1; +} diff --git a/flog/tests/test00 b/flog/tests/test00 new file mode 100755 index 0000000000..a7937e4a18 --- /dev/null +++ b/flog/tests/test00 @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e -x + +echo Map a log file into memory +time ./flog run -i 1000000 -o /tmp/flog.raw.map +echo Write into a log file +time ./flog run -i 1000000 > /tmp/flog.raw +echo Use fprintf +time ./flog run -m fprintf -i 1000000 -o /tmp/flog.fprintf.txt +echo Use dprintf +time ./flog run -m dprintf -i 1000000 -o /tmp/flog.dprintf.txt +echo Use sprintf +time ./flog run -m sprintf -i 1000000 + +time ./flog run -d < /tmp/flog.raw > /tmp/flog.raw.txt +cmp /tmp/flog.raw.txt /tmp/flog.fprintf.txt + +time ./flog run -d < /tmp/flog.raw.map > /tmp/flog.raw.map.txt +cmp /tmp/flog.raw.map.txt /tmp/flog.fprintf.txt + +cmp /tmp/flog.dprintf.txt /tmp/flog.fprintf.txt diff --git a/images/cgroup.proto b/images/cgroup.proto index ee03541240..5c7d16c6d0 100644 --- a/images/cgroup.proto +++ b/images/cgroup.proto @@ -24,6 +24,7 @@ message cgroup_dir_entry { message cg_controller_entry { repeated string cnames = 1; repeated cgroup_dir_entry dirs = 2; + required bool is_threaded = 3; } message cg_member_entry { diff --git a/images/core-mips.proto b/images/core-mips.proto old mode 100755 new mode 100644 diff --git a/images/core.proto b/images/core.proto index 35079f366f..bc8b7a4885 100644 --- a/images/core.proto +++ b/images/core.proto @@ -40,6 +40,7 @@ message task_core_entry { optional task_timers_entry timers = 7; optional task_rlimits_entry rlimits = 8; + /* This is deprecated, should be 
per-thread */ optional uint32 cg_set = 9; optional signal_queue_entry signals_s = 10; @@ -60,6 +61,8 @@ message task_core_entry { // Reserved for container relative start time //optional uint64 start_time = 19; optional uint64 blk_sigset_extended = 20[(criu).hex = true]; + + optional uint32 stop_signo = 21; } message task_kobj_ids_entry { @@ -103,6 +106,7 @@ message thread_core_entry { optional string comm = 13; optional uint64 blk_sigset_extended = 14; optional rseq_entry rseq_entry = 15; + required uint32 cg_set = 16; } message task_rlimits_entry { diff --git a/images/rpc.proto b/images/rpc.proto index a6cc5da487..afd2c7b43f 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -138,6 +138,8 @@ message criu_opts { optional string lsm_mount_context = 63; optional criu_network_lock_method network_lock = 64 [default = IPTABLES]; optional bool mntns_compat_mode = 65; + optional bool skip_file_rwx_check = 66; + optional bool unprivileged = 67; /* optional bool check_mounts = 128; */ } diff --git a/lib/c/criu.c b/lib/c/criu.c index 7807d7bc58..fc8159999c 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -555,6 +555,28 @@ void criu_set_shell_job(bool shell_job) criu_local_set_shell_job(global_opts, shell_job); } +void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check) +{ + opts->rpc->has_skip_file_rwx_check = true; + opts->rpc->skip_file_rwx_check = skip_file_rwx_check; +} + +void criu_set_skip_file_rwx_check(bool skip_file_rwx_check) +{ + criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check); +} + +void criu_local_set_unprivileged(criu_opts *opts, bool unprivileged) +{ + opts->rpc->has_unprivileged = true; + opts->rpc->unprivileged = unprivileged; +} + +void criu_set_unprivileged(bool unprivileged) +{ + criu_local_set_unprivileged(global_opts, unprivileged); +} + void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master) { opts->rpc->has_orphan_pts_master = true; diff --git a/lib/c/criu.h b/lib/c/criu.h 
index 7cc6a199c2..28a083d88d 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -78,6 +78,8 @@ void criu_set_tcp_close(bool tcp_close); void criu_set_weak_sysctls(bool val); void criu_set_evasive_devices(bool evasive_devices); void criu_set_shell_job(bool shell_job); +void criu_set_skip_file_rwx_check(bool skip_file_rwx_check); +void criu_set_unprivileged(bool unprivileged); void criu_set_orphan_pts_master(bool orphan_pts_master); void criu_set_file_locks(bool file_locks); void criu_set_track_mem(bool track_mem); @@ -238,6 +240,7 @@ void criu_local_set_tcp_close(criu_opts *opts, bool tcp_close); void criu_local_set_weak_sysctls(criu_opts *opts, bool val); void criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices); void criu_local_set_shell_job(criu_opts *opts, bool shell_job); +void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check); void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master); void criu_local_set_file_locks(criu_opts *opts, bool file_locks); void criu_local_set_track_mem(criu_opts *opts, bool track_mem); diff --git a/plugins/amdgpu/.gitignore b/plugins/amdgpu/.gitignore new file mode 100644 index 0000000000..4e5c8f58e1 --- /dev/null +++ b/plugins/amdgpu/.gitignore @@ -0,0 +1,3 @@ +*.pb-c.c +*.pb-c.h +test_topology_remap diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 84b9f87147..64a923d388 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -2,7 +2,7 @@ PLUGIN_NAME := amdgpu_plugin PLUGIN_SOBJ := amdgpu_plugin.so -PLUGIN_INCLUDE := -iquote../../../criu/include +PLUGIN_INCLUDE := -iquote../../include PLUGIN_INCLUDE += -iquote../../criu/include PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ PLUGIN_INCLUDE += -iquote../../ @@ -12,10 +12,11 @@ LIBDRM_INC := -I/usr/include/libdrm DEPS_OK := amdgpu_plugin.so amdgpu_plugin_test DEPS_NOK := ; +__nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk CC := gcc -PLUGIN_CFLAGS := -g -Wall 
-Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC +PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu ifeq ($(CONFIG_AMDGPU),y) @@ -50,16 +51,16 @@ clean: amdgpu_plugin_clean amdgpu_plugin_test_clean mrproper: clean install: - $(Q) mkdir -p $(PLUGINDIR) ifeq ($(CONFIG_AMDGPU),y) + $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 644 $(PLUGIN_SOBJ) $(PLUGINDIR) + $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) endif .PHONY: install uninstall: ifeq ($(CONFIG_AMDGPU),y) $(E) " UNINSTALL" $(PLUGIN_NAME) - $(Q) $(RM) $(PLUGINDIR)/$(PLUGIN_SOBJ) + $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) endif .PHONY: uninstall diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index e48c8988b2..0a55e34a2b 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -873,7 +873,7 @@ void *dump_bo_contents(void *_thread_data) } max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; + SDMA_LINEAR_COPY_MAX_SIZE - 1; for (i = 0; i < thread_data->num_of_bos; i++) { if (bo_buckets[i].gpu_id == thread_data->gpu_id && @@ -967,7 +967,7 @@ void *restore_bo_contents(void *_thread_data) } max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; + SDMA_LINEAR_COPY_MAX_SIZE - 1; snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, false, &image_size); diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index cab72e8a18..eced46c22c 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -45,7 +45,4 @@ RUN adduser -u 1000 -D test RUN pip3 install junit_xml -# For zdtm we need an unversioned python binary -RUN ln -s /usr/bin/python3 /usr/bin/python - RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.openj9-alpine b/scripts/build/Dockerfile.hotspot-alpine similarity index 69% rename from scripts/build/Dockerfile.openj9-alpine rename to scripts/build/Dockerfile.hotspot-alpine index f92011283c..cb9332fd0c 100644 --- a/scripts/build/Dockerfile.openj9-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,6 +1,4 @@ -# FIXME: Replace with eclipse-temurin once Alpine support has been added. 
-# https://github.com/adoptium/containers/pull/60 -FROM adoptopenjdk/openjdk8-openj9:alpine +FROM docker.io/library/eclipse-temurin:11-alpine ARG CC=gcc RUN apk update && apk add \ @@ -29,4 +27,3 @@ WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu new file mode 100644 index 0000000000..350102818b --- /dev/null +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -0,0 +1,34 @@ +FROM docker.io/library/eclipse-temurin:11-focal +ARG CC=gcc + +COPY scripts/ci/apt-install /bin/apt-install + +RUN apt-install protobuf-c-compiler \ + libprotobuf-c-dev \ + libaio-dev \ + python3-future \ + libprotobuf-dev \ + protobuf-compiler \ + libcap-dev \ + libnl-3-dev \ + gdb \ + bash \ + python3-protobuf \ + python3-yaml \ + libnet-dev \ + libnl-route-3-dev \ + libbsd-dev \ + make \ + git \ + pkg-config \ + iptables \ + gcc \ + maven + +COPY . /criu +WORKDIR /criu + +RUN make mrproper && make -j $(nproc) CC="$CC" + +ENTRYPOINT mvn -q -f test/javaTests/pom.xml test + diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 8936adf815..23db14e8df 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:8-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal ARG CC=gcc COPY scripts/ci/apt-install /bin/apt-install @@ -31,4 +31,3 @@ WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 120f561e48..30dd9ebeb8 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -85,8 +85,8 @@ podman-test: # overlayfs behaves differently on Ubuntu and breaks CRIU # https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 # Switch to devicemapper -openj9-test: restart-docker - 
./openj9-test.sh +java-test: restart-docker + ./java-test.sh setup-vagrant: ./vagrant.sh setup @@ -97,7 +97,10 @@ vagrant-fedora-no-vdso: setup-vagrant vagrant-fedora-rawhide: setup-vagrant ./vagrant.sh fedora-rawhide -.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide +vagrant-fedora-non-root: setup-vagrant + ./vagrant.sh fedora-non-root + +.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root %: $(MAKE) -C ../build $@$(target-suffix) diff --git a/scripts/ci/apt-install b/scripts/ci/apt-install index 5a790901aa..45aca13f40 100755 --- a/scripts/ci/apt-install +++ b/scripts/ci/apt-install @@ -15,8 +15,7 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - # shellcheck disable=SC2068 - apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends $@ && break + apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends "$@" && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index 8113b9b195..deeeca0b9d 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -1,7 +1,5 @@ #!/bin/bash -# shellcheck disable=2044 - set -x cat /proc/self/mountinfo @@ -13,7 +11,8 @@ chmod 0777 test/zdtm/static ./test/zdtm.py run -a --keep-going -k always --parallel 4 -x zdtm/static/rtc "$@" ret=$? -for i in $(find / -name 'asan.log*'); do +shopt -s globstar nullglob +for i in /**/asan.log*; do echo "$i" echo ======================================== cat "$i" diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index f36b4e4581..beb7da6da6 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -1,7 +1,5 @@ #!/bin/bash -# shellcheck disable=SC1091,SC2015 - set -x -e -o pipefail ./apt-install \ @@ -19,29 +17,17 @@ add-apt-repository \ ./apt-install docker-ce +# shellcheck source=/dev/null . 
/etc/lsb-release -# overlayfs with current Ubuntu kernel breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 -# Use devicemapper storage drive as a work-around -echo '{ "experimental": true, "storage-driver": "devicemapper" }' > /etc/docker/daemon.json +# docker checkpoint and restore is an experimental feature +echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf -service docker stop -systemctl stop containerd.service - -# Always use the latest containerd release. -# Restore with containerd versions after v1.2.14 and before v1.5.0-beta.0 are broken. -# https://github.com/checkpoint-restore/criu/issues/1223 -CONTAINERD_DOWNLOAD_URL=$(curl -s https://api.github.com/repos/containerd/containerd/releases/latest | grep '"browser_download_url":.*/containerd-.*-linux-amd64.tar.gz.$' | cut -d\" -f4) -wget -nv "$CONTAINERD_DOWNLOAD_URL" -O - | tar -xz -C /usr/ - -systemctl restart containerd.service -service docker restart - export SKIP_CI_TEST=1 ./run-ci-tests.sh @@ -88,17 +74,37 @@ checkpoint_container () { docker wait cr } -restore_container () { - CHECKPOINT_NAME=$1 - - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { +print_logs () { cat "$(grep log 'log file:' | sed 's/log file:\s*//')" || true docker logs cr || true cat $CRIU_LOG || true dmesg docker ps exit 1 - } +} + +declare -i max_restore_container_tries=3 +current_iteration=0 + +restore_container () { + CHECKPOINT_NAME=$1 + + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { + # FIXME: There is a race condition in docker/containerd that causes + # docker to occasionally fail when starting a container from a + # checkpoint immediately after the checkpoint has been created. 
+ # https://github.com/moby/moby/issues/42900 + if [ "$current_iteration" -gt "$max_restore_container_tries" ]; then + print_logs + fi + grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log && { + ((current_iteration+=1)) + echo "Retry container restore: $current_iteration" + sleep 1; + restore_container "$CHECKPOINT_NAME" + } || + print_logs + } && current_iteration=0 } # Scenario: Create multiple containers and checkpoint and restore them once diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh new file mode 100755 index 0000000000..7cf704f074 --- /dev/null +++ b/scripts/ci/java-test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cd ../.. || exit 1 + +failures="" + +docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . +if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then + failures="$failures openj9-ubuntu" +fi + +docker build -t criu-hotspot-alpine-test:latest -f scripts/build/Dockerfile.hotspot-alpine . +if ! docker run --rm --privileged criu-hotspot-alpine-test:latest; then + failures="$failures hotspot-alpine" +fi + +docker build -t criu-hotspot-ubuntu-test:latest -f scripts/build/Dockerfile.hotspot-ubuntu . +if ! docker run --rm --privileged criu-hotspot-ubuntu-test:latest; then + failures="$failures hotspot-ubuntu" +fi + +if [ -n "$failures" ]; then + echo "Tests failed on $failures" + exit 1 +fi diff --git a/scripts/ci/openj9-test.sh b/scripts/ci/openj9-test.sh deleted file mode 100755 index b8c07f1802..0000000000 --- a/scripts/ci/openj9-test.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -cd ../.. || exit 1 - -failures="" - -docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . -if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then - failures="$failures ubuntu" -fi - -docker build -t criu-openj9-alpine-test:latest -f scripts/build/Dockerfile.openj9-alpine . -if ! 
docker run --rm --privileged criu-openj9-alpine-test:latest; then - failures="$failures alpine" -fi - -if [ -n "$failures" ]; then - echo "Tests failed on $failures" - exit 1 -fi diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 414004514b..e08fdf3bc5 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -25,13 +25,8 @@ make install popd rm -rf "${tmp_dir}" -# overlayfs with current Ubuntu kernel breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 -# Use VFS storage drive as a work-around -export STORAGE_DRIVER=vfs -podman --storage-driver vfs info +podman info -# shellcheck disable=SC2016 podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' sleep 1 diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 81aa072363..7b64c6b066 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,14 +1,17 @@ #!/bin/bash set -x -e -CI_PKGS="protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev +CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml libperl-dev pkg-config python3-future python3-protobuf - python3-junit.xml" + python3-junit.xml) -X86_64_PKGS="gcc-multilib" +X86_64_PKGS=(gcc-multilib) + +# Convert from string to array. +IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) @@ -46,14 +49,14 @@ ci_prep () { else CC=gcc fi - CI_PKGS="$CI_PKGS $CC" + CI_PKGS+=("$CC") # Do not install x86_64 specific packages on other architectures if [ "$UNAME_M" = "x86_64" ]; then - CI_PKGS="$CI_PKGS $X86_64_PKGS" + CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "$CI_PKGS" + scripts/ci/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" # zdtm uses an unversioned python binary to run the tests. 
@@ -69,9 +72,8 @@ test_stream() { # restorer and eventually close the page read. However, image-streamer expects the # whole image to be read and the image is not reopened, sent twice. These MAP_HUGETLB # test cases will result in EPIPE error at the moment. - STREAM_TEST_EXCLUDE="-x maps09 -x maps10" - # shellcheck disable=SC2086 - ./test/zdtm.py run --stream -p 2 --keep-going -a $STREAM_TEST_EXCLUDE $ZDTM_OPTS + STREAM_TEST_EXCLUDE=(-x maps09 -x maps10) + ./test/zdtm.py run --stream -p 2 --keep-going -a "${STREAM_TEST_EXCLUDE[@]}" "${ZDTM_OPTS[@]}" } print_header() { @@ -142,6 +144,11 @@ time make unittest [ -n "$SKIP_CI_TEST" ] && exit 0 +# Umount cpuset in cgroupv1 to make it move to cgroupv2 +if [ -d /sys/fs/cgroup/cpuset ]; then + umount /sys/fs/cgroup/cpuset +fi + ulimit -c unlimited cgid=$$ @@ -160,21 +167,20 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then # for 32-bit tests. A better way would involve launching docker.. # But it would require making zdtm.py aware of docker and launching # tests inside the CT. 
- INCOMPATIBLE_LIBS="libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev" - IA32_PKGS="" + INCOMPATIBLE_LIBS=(libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev) + IA32_PKGS=() REFUGE=64-refuge mkdir "$REFUGE" - for i in $INCOMPATIBLE_LIBS ; do + for i in "${INCOMPATIBLE_LIBS[@]}" ; do for j in $(dpkg --listfiles "$i" | grep '\.so$') ; do cp "$j" "$REFUGE/" done - IA32_PKGS="$IA32_PKGS $i:i386" + IA32_PKGS+=("$i:i386") done - # shellcheck disable=SC2086 - apt-get remove $INCOMPATIBLE_LIBS + apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "$IA32_PKGS" + scripts/ci/apt-install "${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi @@ -211,15 +217,12 @@ if [ "${STREAM_TEST}" = "1" ]; then exit 0 fi -# shellcheck disable=SC2086 -./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS +./test/zdtm.py run -a -p 2 --keep-going "${ZDTM_OPTS[@]}" if criu/criu check --feature move_mount_set_group; then - # shellcheck disable=SC2086 - ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going $ZDTM_OPTS + ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going "${ZDTM_OPTS[@]}" fi -# shellcheck disable=SC2086 -./test/zdtm.py run -a -p 2 --keep-going --criu-config $ZDTM_OPTS +./test/zdtm.py run -a -p 2 --keep-going --criu-config "${ZDTM_OPTS[@]}" # Newer kernels are blocking access to userfaultfd: # uffd: Set unprivileged_userfaultfd sysctl knob to 1 if kernel faults must be handled without obtaining CAP_SYS_PTRACE capability @@ -227,17 +230,14 @@ if [ -e /proc/sys/vm/unprivileged_userfaultfd ]; then echo 1 > /proc/sys/vm/unprivileged_userfaultfd fi -LAZY_EXCLUDE="-x maps04 -x cmdlinenv00 -x maps007" +LAZY_EXCLUDE=(-x maps04 -x cmdlinenv00 -x maps007) LAZY_TESTS='.*(maps0|uffd-events|lazy-thp|futex|fork).*' -LAZY_OPTS="-p 2 -T $LAZY_TESTS $LAZY_EXCLUDE $ZDTM_OPTS" +LAZY_OPTS=(-p 2 -T "$LAZY_TESTS" "${LAZY_EXCLUDE[@]}" "${ZDTM_OPTS[@]}") -# shellcheck disable=SC2086 -./test/zdtm.py run 
$LAZY_OPTS --lazy-pages -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages --tls +./test/zdtm.py run "${LAZY_OPTS[@]}" --lazy-pages +./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages +./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages --tls bash -x ./test/jenkins/criu-fault.sh if [ "$UNAME_M" == "x86_64" ]; then @@ -260,6 +260,7 @@ if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi +make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling @@ -268,6 +269,7 @@ make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server --dedup +./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --pre-dump-mode read ./test/zdtm.py run -t zdtm/transition/pid_reuse --pre 2 # start time based pid reuse detection ./test/zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2 # pidfd based pid reuse detection diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index af0f7335ad..e23486f29e 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -68,4 +68,16 @@ fedora-rawhide() { ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } +fedora-non-root() { + ssh default uname -a + ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + # Setting the capability should be the only line needed to run as non-root on Fedora + # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu + ssh default 'sudo setcap 
cap_checkpoint_restore+eip /vagrant/criu/criu/criu' + # Run it once as non-root + ssh default 'cd /vagrant/criu; criu/criu check --unprivileged; ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' + # Run it as root with '--rootless' + ssh default 'cd /vagrant/criu; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h; sudo chmod 777 test/dump/zdtm/static/{env00,pthread00}; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' +} + $1 diff --git a/scripts/crit-setup.py b/scripts/crit-setup.py index 871e55921e..13df03e3b5 100644 --- a/scripts/crit-setup.py +++ b/scripts/crit-setup.py @@ -1,10 +1,24 @@ +import os from distutils.core import setup +criu_version = "0.0.1" +env = os.environ + +if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: + criu_version = '{}.{}'.format( + env['CRIU_VERSION_MAJOR'], + env['CRIU_VERSION_MINOR'] + ) + + if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: + criu_version += '.' + env['CRIU_VERSION_SUBLEVEL'] + setup(name="crit", - version="0.0.1", + version=criu_version, description="CRiu Image Tool", author="CRIU team", author_email="criu@openvz.org", + license="GPLv2", url="https://github.com/checkpoint-restore/criu", package_dir={'pycriu': 'lib/py'}, packages=["pycriu", "pycriu.images"], diff --git a/scripts/criu-ns b/scripts/criu-ns index 9fc58b6406..d51e7772c0 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -4,6 +4,8 @@ import ctypes.util import errno import sys import os +import fcntl +import termios # constants for unshare CLONE_NEWNS = 0x00020000 @@ -124,6 +126,16 @@ def wrap_restore(): criu_pid = os.fork() if criu_pid == 0: os.setsid() + # Set stdin tty to be a controlling tty of our new session, this is + # required by --shell-job option, as for it CRIU would try to set a + # process group of restored root task to be a foreground group on the + # terminal. 
+ if '--shell-job' in restore_args or '-j' in restore_args: + if os.isatty(sys.stdin.fileno()): + fcntl.ioctl(sys.stdin.fileno(), termios.TIOCSCTTY, 1) + else: + raise OSError(errno.EINVAL, 'The stdin is not a tty for a --shell-job') + _mount_new_proc() run_criu(restore_args) @@ -153,9 +165,9 @@ def _set_namespace(fd): raise OSError(_errno, errno.errorcode[_errno]) -def is_my_namespace(fd): +def is_my_namespace(fd, ns): """Returns True if fd refers to current namespace""" - return os.stat('/proc/self/ns/pid').st_ino != os.fstat(fd).st_ino + return os.stat('/proc/self/ns/%s' % ns).st_ino == os.fstat(fd).st_ino def set_pidns(tpid, pid_idx): @@ -165,7 +177,7 @@ def set_pidns(tpid, pid_idx): pid namespace. """ ns_fd = os.open('/proc/%s/ns/pid' % tpid, os.O_RDONLY) - if is_my_namespace(ns_fd): + if not is_my_namespace(ns_fd, "pid"): for line in open('/proc/%s/status' % tpid): if not line.startswith('NSpid:'): continue @@ -190,7 +202,7 @@ def set_mntns(tpid): will be the same in target mntns. 
""" ns_fd = os.open('/proc/%s/ns/mnt' % tpid, os.O_RDONLY) - if is_my_namespace(ns_fd): + if not is_my_namespace(ns_fd, "mnt"): root_st = os.stat('/') cwd_st = os.stat('.') cwd_path = os.path.realpath('.') diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index 592552cb8e..fb5d2ef7ad 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -137,19 +137,6 @@ ENTRY(main) END(main) endef -define FEATURE_TEST_FSCONFIG - -#include - -int main(void) -{ - if (FSCONFIG_CMD_CREATE > 0) - return 0; - return 0; -} - -endef - define FEATURE_TEST_NFTABLES_LIB_API_0 #include @@ -196,3 +183,22 @@ int main(void) return 0; } endef + +define FEATURE_TEST_NO_LIBC_RSEQ_DEFS + +#ifdef __has_include +#if __has_include(\"sys/rseq.h\") +#include +#endif +#endif + +enum rseq_cpu_id_state { + RSEQ_CPU_ID_UNINITIALIZED = -1, + RSEQ_CPU_ID_REGISTRATION_FAILED = -2, +}; + +int main(void) +{ + return 0; +} +endef diff --git a/scripts/protobuf-gen.sh b/scripts/protobuf-gen.sh index 0c738f13a1..25d2feaeb9 100644 --- a/scripts/protobuf-gen.sh +++ b/scripts/protobuf-gen.sh @@ -1,15 +1,15 @@ #!/bin/bash -# shellcheck disable=SC2013,SC1004 - TR="y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/" -for x in $(sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { +sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { /PB_AUTOGEN_ST/d; + /^[ \t]*$/d; s/,.*$//; s/\tPB_//; p; - }' criu/include/protobuf-desc.h); do + }' criu/include/protobuf-desc.h | \ +while IFS= read -r x; do x_la=$(echo "$x" | sed $TR) x_uf=$(echo "$x" | sed -nr 's/^./&#\\\ /; diff --git a/test/Makefile b/test/Makefile index 8416b19619..e8fcffe3fc 100644 --- a/test/Makefile +++ b/test/Makefile @@ -12,7 +12,7 @@ all: $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job skip-file-rwx-check other: 
for t in $(TESTS); do \ diff --git a/test/javaTests/pom.xml b/test/javaTests/pom.xml index faae44d1bf..ddb6c89cf1 100644 --- a/test/javaTests/pom.xml +++ b/test/javaTests/pom.xml @@ -38,7 +38,7 @@ org.testng testng - 6.3.1 + 7.7.0 diff --git a/test/jenkins/criu-dedup.sh b/test/jenkins/criu-dedup.sh index 842d218bd5..edb1b653d1 100755 --- a/test/jenkins/criu-dedup.sh +++ b/test/jenkins/criu-dedup.sh @@ -4,7 +4,7 @@ set -e source `dirname $0`/criu-lib.sh prep -./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 -x maps09 -x maps10 || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 || fail # Additionally run these tests as they touch a lot of # memory and it makes sense to additionally check it diff --git a/test/jenkins/criu-lazy-migration.sh b/test/jenkins/criu-lazy-migration.sh index b23f31c79d..02a212e0d0 100755 --- a/test/jenkins/criu-lazy-migration.sh +++ b/test/jenkins/criu-lazy-migration.sh @@ -15,7 +15,7 @@ LAZY_MIGRATE_EXCLUDE="-x fifo_loop -x file_locks -x ptrace_sig -x overmount_file --lazy-migrate $LAZY_EXCLUDE $LAZY_MIGRATE_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 -f uns \ diff --git a/test/jenkins/criu-lazy-pages.sh b/test/jenkins/criu-lazy-pages.sh index f629120909..9ef7217391 100755 --- a/test/jenkins/criu-lazy-pages.sh +++ b/test/jenkins/criu-lazy-pages.sh @@ -12,7 +12,7 @@ source `dirname $0`/criu-lazy-common.sh --lazy-pages $LAZY_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report 
report --parallel 4 \ diff --git a/test/jenkins/criu-pre-dump.sh b/test/jenkins/criu-pre-dump.sh index b2972d941f..137f7c23fd 100755 --- a/test/jenkins/criu-pre-dump.sh +++ b/test/jenkins/criu-pre-dump.sh @@ -5,6 +5,5 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -# FIXME: https://github.com/checkpoint-restore/criu/issues/1868 -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' -x 'maps09' -x 'maps10' || fail -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' -x 'maps09' -x 'maps10' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' || fail diff --git a/test/jenkins/criu-remote-lazy-pages.sh b/test/jenkins/criu-remote-lazy-pages.sh index 48787f3f63..1c677e3336 100755 --- a/test/jenkins/criu-remote-lazy-pages.sh +++ b/test/jenkins/criu-remote-lazy-pages.sh @@ -12,7 +12,7 @@ source `dirname $0`/criu-lazy-common.sh --remote-lazy-pages $LAZY_EXCLUDE -x maps04 || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from "remote" dump with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ diff --git a/test/jenkins/criu-snap.sh b/test/jenkins/criu-snap.sh index d8fdf02b3a..b08c57f523 100755 --- a/test/jenkins/criu-snap.sh +++ b/test/jenkins/criu-snap.sh @@ -5,5 +5,5 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' -x 'maps09' -x 'maps10' || fail -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' -x 'maps09' -x 'maps10' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 
--pre 3 --snaps -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' || fail diff --git a/test/others/config-file/run.sh b/test/others/config-file/run.sh index 92195883e5..26b835b45e 100755 --- a/test/others/config-file/run.sh +++ b/test/others/config-file/run.sh @@ -11,7 +11,7 @@ set -xbm -#shellcheck disable=SC1091 +# shellcheck source=test/others/env.sh source ../env.sh if [ ! -d /etc/criu ]; then diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 0d38043d7a..5d13066e70 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -1,11 +1,12 @@ #!/bin/bash -# shellcheck disable=SC1091,SC2002 +# shellcheck disable=SC2002 set -x +# shellcheck source=test/others/env.sh source ../env.sh -images_list="" +images_list=() function gen_imgs { PID=$(../loop) @@ -16,15 +17,15 @@ function gen_imgs { exit 1 fi - images_list=$(ls -1 ./*.img) - if [ -z "$images_list" ]; then + images_list=(./*.img) + if [ "${#images_list[@]}" -eq 0 ]; then echo "Failed to generate images" exit 1 fi } function run_test1 { - for x in $images_list + for x in "${images_list[@]}" do echo "=== $x" if [[ $x == *pages* ]]; then @@ -45,9 +46,7 @@ function run_test1 { function run_test2 { - mapfile -t array <<< "$images_list" - - PROTO_IN=${array[0]} + PROTO_IN="${images_list[0]}" JSON_IN=$(mktemp -p ./ tmp.XXXXXXXXXX.json) OUT=$(mktemp -p ./ tmp.XXXXXXXXXX.log) diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index dd774e298b..9b6e564755 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -1,7 +1,7 @@ #!/bin/bash set -x -# shellcheck disable=SC1091 +# shellcheck source=test/others/env.sh source ../env.sh || exit 1 function gen_imgs { diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 77bdfb87eb..f7d363aabe 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -9,7 +9,7 @@ 
TEST_LOG="${TEST_DIR}/test.log" DUMP_LOG="${TEST_DIR}/dump.log" RESTORE_LOG="${TEST_DIR}/restore.log" -# shellcheck disable=1091 +# shellcheck source=test/others/env.sh source "${MAIN_DIR}/../env.sh" || exit 1 echo "== Clean" diff --git a/test/others/skip-file-rwx-check/Makefile b/test/others/skip-file-rwx-check/Makefile new file mode 100644 index 0000000000..419d592b73 --- /dev/null +++ b/test/others/skip-file-rwx-check/Makefile @@ -0,0 +1,7 @@ +.PHONY: run clean + +run: + ./run.sh + +clean: + rm -rf testfile *.img dump.log restore-expected-fail.log restore.log stats-dump stats-restore diff --git a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh new file mode 100755 index 0000000000..0803d78eca --- /dev/null +++ b/test/others/skip-file-rwx-check/run.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +source ../env.sh + +make clean +touch testfile +chmod +w testfile +tail --follow testfile & +tailpid=$! +if ! "$criu" dump --tree=$tailpid --shell-job --verbosity=4 --log-file=dump.log +then + kill $tailpid + echo "Failed to dump process as expected" + echo FAIL + exit 1 +fi +chmod -w testfile +if "$criu" restore --restore-detached --shell-job --verbosity=4 --log-file=restore-expected-fail.log +then + kill $tailpid + echo "Unexpectedly restored process with reference to a file who's r/w/x perms changed when --skip-file-rwx-check option was not used" + echo FAIL + exit 1 +fi +if ! 
"$criu" restore --skip-file-rwx-check --restore-detached --shell-job --verbosity=4 --log-file=restore.log +then + echo "Failed to restore process with reference to a file who's r/w/x perms changed when --skip-file-rwx-check option was used" + echo FAIL + exit 1 +fi +kill $tailpid +echo PASS diff --git a/test/zdtm.py b/test/zdtm.py index c011c79c0e..a311610c3f 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -41,6 +41,8 @@ prev_line = None uuid = uuid.uuid4() +NON_ROOT_UID = 65534 + def alarm(*args): print("==== ALARM ====") @@ -267,7 +269,7 @@ def __copy_deps(self, deps): def init(self, l_bins, x_bins): subprocess.check_call( - ["mount", "--make-slave", "--bind", ".", self.root]) + ["mount", "--make-private", "--bind", ".", self.root]) self.root_mounted = True if not os.access(self.root + "/.constructed", os.F_OK): @@ -392,10 +394,11 @@ def __init__(self, cr_action): class zdtm_test: - def __init__(self, name, desc, flavor, freezer): + def __init__(self, name, desc, flavor, freezer, rootless): self.__name = name self.__desc = desc self.__freezer = None + self.__rootless = rootless self.__make_action('cleanout') self.__pid = 0 self.__flavor = flavor @@ -439,6 +442,8 @@ def __wait_task_die(self): wait_pid_die(int(self.__pid), self.__name, self.__timeout) def __add_wperms(self): + if os.getuid() != 0: + return # Add write perms for .out and .pid files for b in self._bins: p = os.path.dirname(b) @@ -457,6 +462,9 @@ def start(self): env['ZDTM_NOTIFY_FDIN'] = "100" env['ZDTM_NOTIFY_FDOUT'] = "101" + if self.__rootless: + env['ZDTM_ROOTLESS'] = "1" + if not test_flag(self.__desc, 'suid'): # Numbers should match those in criu env['ZDTM_UID'] = "18943" @@ -618,11 +626,15 @@ def available(): ["make", "zdtm_ct"], env=dict(os.environ, MAKEFLAGS="")) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) + if opts['rootless']: + return subprocess.check_call( ["flock", "zdtm_mount_cgroups.lock", "./zdtm_mount_cgroups", str(uuid)]) 
@staticmethod def cleanup(): + if opts['rootless']: + return subprocess.check_call( ["flock", "zdtm_mount_cgroups.lock", "./zdtm_umount_cgroups", str(uuid)]) @@ -640,7 +652,9 @@ def load_module_from_file(name, path): class inhfd_test: - def __init__(self, name, desc, flavor, freezer): + def __init__(self, name, desc, flavor, freezer, rootless): + if rootless: + raise test_fail_exc("This kind of test does not currently support rootless mode") self.__name = os.path.basename(name) print("Load %s" % name) self.__fdtyp = load_module_from_file(self.__name, name) @@ -801,8 +815,8 @@ def cleanup(): class groups_test(zdtm_test): - def __init__(self, name, desc, flavor, freezer): - zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer) + def __init__(self, name, desc, flavor, freezer, rootless): + zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer, rootless) if flavor.ns: self.__real_name = name with open(name) as fd: @@ -1039,6 +1053,7 @@ def __init__(self, opts): self.__dedup = bool(opts['dedup']) self.__mdedup = bool(opts['noauto_dedup']) self.__user = bool(opts['user']) + self.__rootless = bool(opts['rootless']) self.__leave_stopped = bool(opts['stop']) self.__stream = bool(opts['stream']) self.__show_stats = bool(opts['show_stats']) @@ -1138,6 +1153,9 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): print("Run criu " + action) + if self.__rootless: + s_args += ["--unprivileged"] + strace = [] if self.__sat: fname = os.path.join(self.__ddir(), action + '.strace') @@ -1156,7 +1174,10 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): if action == "restore": preexec = None else: - preexec = self.__user and self.set_user_id or None + if os.getuid(): + preexec = None + else: + preexec = self.__user and self.set_user_id or None __ddir = self.__ddir() @@ -1476,10 +1497,11 @@ def check(feature): except Exception: return False - return criu_cli.run( - "check", - ["--no-default-config", "--verbosity=0", "--feature", 
feature], - opts['criu_bin']) == 0 + args = ["--no-default-config", "--verbosity=0", "--feature", feature] + if opts['rootless']: + args += ["--unprivileged"] + + return criu_cli.run("check", args, opts['criu_bin']) == 0 @staticmethod def available(): @@ -1651,6 +1673,15 @@ def get_visible_state(test): return files, maps, mounts +def has_vsyscall(maps): + vsyscall = u"ffffffffff600000-ffffffffff601000" + for i in maps: + if vsyscall in i: + return i + + return None + + def check_visible_state(test, state, opts): new = get_visible_state(test) @@ -1666,9 +1697,9 @@ def check_visible_state(test, state, opts): new_maps = new[1][pid] if os.getenv("COMPAT_TEST"): # the vsyscall vma isn't unmapped from x32 processes - vsyscall = u"ffffffffff600000-ffffffffff601000 r-xp" - if vsyscall in new_maps and vsyscall not in old_maps: - new_maps.remove(vsyscall) + entry = has_vsyscall(new_maps) + if entry and has_vsyscall(old_maps) is None: + new_maps.remove(entry) if old_maps != new_maps: print("%s: Old maps lost: %s" % (pid, old_maps - new_maps)) print("%s: New maps appeared: %s" % (pid, new_maps - old_maps)) @@ -1891,7 +1922,7 @@ def do_run_test(tname, tdesc, flavs, opts): if opts['dry_run']: continue flav = flavors[f](opts) - t = tclass(tname, tdesc, flav, fcg) + t = tclass(tname, tdesc, flav, fcg, opts['rootless']) cr_api = criu(opts) try: @@ -2042,7 +2073,8 @@ def run_test(self, name, desc, flavor): 'sat', 'script', 'rpc', 'criu_config', 'lazy_pages', 'join_ns', 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', - 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode') + 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', + 'rootless') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2052,6 +2084,9 @@ def run_test(self, name, desc, flavor): logf = None log = None + if opts['rootless'] and os.getuid() == 0: + os.setgid(NON_ROOT_UID) + 
os.setuid(NON_ROOT_UID) sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], env=dict(os.environ, CR_CT_TEST_INFO=arg), stdout=log, @@ -2591,6 +2626,10 @@ def set_nr_hugepages(nr): with open("/proc/sys/vm/nr_hugepages", "w") as f: f.write("{}\n".format(nr)) return orig_hugepages + except PermissionError as err: + # EACCES is expected when running as non-root, otherwise re-raise the exception. + if err.errno != errno.EACCES or os.getuid() == 0: + raise except OSError as err: if err.errno != errno.EOPNOTSUPP: raise @@ -2664,6 +2703,10 @@ def get_cli_args(): rp.add_argument("--freezecg", help="Use freeze cgroup (path:state)") rp.add_argument("--user", help="Run CRIU as regular user", action='store_true') + rp.add_argument( + "--rootless", + help="Run CRIU rootless (uid!=0) (needs CAP_CHECKPOINT_RESTORE)", + action='store_true') rp.add_argument("--rpc", help="Run CRIU via RPC rather than CLI", action='store_true') diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 3ec58dfaf7..949dc123a7 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -4,7 +4,7 @@ CFLAGS += $(USERCFLAGS) LIB := libzdtmtst.a -LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c +LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c file.c PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') diff --git a/test/zdtm/lib/file.c b/test/zdtm/lib/file.c new file mode 100644 index 0000000000..57d85421d3 --- /dev/null +++ b/test/zdtm/lib/file.c @@ -0,0 +1,46 @@ +#include +#include +#include "zdtmtst.h" + +int write_value(const char *path, const char *value) +{ + int fd, l; + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + l = write(fd, value, strlen(value)); + if (l < 0) { + pr_perror("failed to write %s to %s", value, path); + close(fd); + return -1; + } + + close(fd); + return 0; +} + +int 
read_value(const char *path, char *value, int size) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + ret = read(fd, (void *)value, size - 1); + if (ret < 0) { + pr_perror("read %s", path); + close(fd); + return -1; + } + + value[ret] = '\0'; + close(fd); + return 0; +} diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 57eb42046a..6291ea4a7b 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -239,34 +239,37 @@ void test_init(int argc, char **argv) exit(1); } - val = getenv("ZDTM_GROUPS"); - if (val) { - char *tok = NULL; - unsigned int size = 0, groups[NGROUPS_MAX]; - - tok = strtok(val, " "); - while (tok) { - size++; - groups[size - 1] = atoi(tok); - tok = strtok(NULL, " "); + val = getenv("ZDTM_ROOTLESS"); + if (!val) { + val = getenv("ZDTM_GROUPS"); + if (val) { + char *tok = NULL; + unsigned int size = 0, groups[NGROUPS_MAX]; + + tok = strtok(val, " "); + while (tok) { + size++; + groups[size - 1] = atoi(tok); + tok = strtok(NULL, " "); + } + + if (setgroups(size, groups)) { + fprintf(stderr, "Can't set groups: %m"); + exit(1); + } } - if (setgroups(size, groups)) { - fprintf(stderr, "Can't set groups: %m"); + val = getenv("ZDTM_GID"); + if (val && (setgid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); exit(1); } - } - val = getenv("ZDTM_GID"); - if (val && (setgid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); - } - - val = getenv("ZDTM_UID"); - if (val && (setuid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); + val = getenv("ZDTM_UID"); + if (val && (setuid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set uid: %m"); + exit(1); + } } if (prctl(PR_SET_DUMPABLE, 1)) { diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index ed7c23ee26..105f3c11a0 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -126,11 +126,25 @@ extern int write_pidfile(int pid); /* message helpers */ extern 
int test_log_init(const char *outfile, const char *suffix); extern int zdtm_seccomp; -#define pr_err(format, arg...) test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg) -#define pr_perror(format, arg...) \ - test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#define fail(format, arg...) \ - test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#define pr_err(format, arg...) \ + ({ \ + test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg); \ + 1; \ + }) + +#define pr_perror(format, arg...) \ + ({ \ + test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ + strerror(errno)); \ + 1; \ + }) + +#define fail(format, arg...) \ + ({ \ + test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ + strerror(errno)); \ + 1; \ + }) #define skip(format, arg...) test_msg("SKIP: %s:%d: " format "\n", __FILE__, __LINE__, ##arg) #define pass() test_msg("PASS\n") @@ -202,4 +216,7 @@ static inline void cleanup_closep(void *p) TEMP_FAILURE_RETRY(close(*pp)); } +extern int write_value(const char *path, const char *value); +extern int read_value(const char *path, char *value, int size); + #endif /* _VIMITESU_H_ */ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 9dc02d4a58..000488133d 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -184,6 +184,8 @@ TST_NOFILE := \ stopped01 \ stopped02 \ stopped12 \ + stopped03 \ + stopped04 \ rtc \ clean_mntns \ mntns_rw_ro_rw \ @@ -199,6 +201,7 @@ TST_NOFILE := \ scm04 \ scm05 \ scm06 \ + scm09 \ aio00 \ aio01 \ fd \ @@ -303,6 +306,10 @@ TST_FILE = \ ghost_holes00 \ ghost_holes01 \ ghost_holes02 \ + ghost_holes_large00 \ + ghost_holes_large01 \ + ghost_multi_hole00 \ + ghost_multi_hole01 \ unlink_largefile \ mtime_mmap \ fifo \ @@ -347,6 +354,10 @@ TST_FILE = \ socket_close_data01 \ fifo_upon_unix_socket00 \ 
fifo_upon_unix_socket01 \ + sk-unix-listen01 \ + sk-unix-listen02 \ + sk-unix-listen03 \ + sk-unix-listen04 \ TST_DIR = \ cwd00 \ @@ -377,6 +388,8 @@ TST_DIR = \ cgroup02 \ cgroup03 \ cgroup04 \ + cgroupv2_00 \ + cgroupv2_01 \ cgroup_ifpriomap \ cgroup_ignore \ cgroup_stray \ @@ -402,10 +415,12 @@ TST_DIR = \ mnt_ext_master \ mnt_ext_dev \ mnt_ext_root \ + mnt_root_ext \ mnt_ext_collision \ mntns_pivot_root \ mntns_pivot_root_ro \ mnt_ext_sharing \ + mnt_ext_multiple \ mount_complex_sharing \ mnt_tracefs \ mntns_deleted \ @@ -591,6 +606,7 @@ vdso01: LDLIBS += -lrt scm01: CFLAGS += -DKEEP_SENT_FD scm02: CFLAGS += -DSEND_BOTH scm04: CFLAGS += -DSEPARATE +scm09: CFLAGS += -DCLOSE_SENDER_FD mntns_link_remap: CFLAGS += -DZDTM_LINK_REMAP mntns_shared_bind02: CFLAGS += -DSHARED_BIND02 mntns_root_bind02: CFLAGS += -DROOT_BIND02 @@ -603,6 +619,7 @@ unlink_fstat04: CFLAGS += -DUNLINK_FSTAT04 unlink_fstat041: CFLAGS += -DUNLINK_FSTAT041 -DUNLINK_FSTAT04 ghost_holes01: CFLAGS += -DTAIL_HOLE ghost_holes02: CFLAGS += -DHEAD_HOLE +ghost_holes_large01: CFLAGS += -DLIMIT sk-freebind-false: CFLAGS += -DZDTM_FREEBIND_FALSE selinux02: CFLAGS += -DUSING_SOCKCREATE stopped01: CFLAGS += -DZDTM_STOPPED_KILL @@ -661,6 +678,12 @@ bpf_array: LDLIBS += -lbpf fifo_upon_unix_socket01: CFLAGS += -DFIFO_UPON_UNIX01 +sk-unix-listen02: CFLAGS += -DSK_UNIX_LISTEN02 +sk-unix-listen03: CFLAGS += -DSK_UNIX_LISTEN03 +sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 + +cgroupv2_01: LDLIBS += -pthread + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index 5a424be125..8c40ffd6bd 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -19,26 +19,6 @@ char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; -int write_value(const char *path, const char *value) -{ - int fd, l; - - fd = open(path, O_WRONLY); - if (fd < 0) { - pr_perror("open %s", path); - 
return -1; - } - - l = write(fd, value, strlen(value)); - close(fd); - if (l < 0) { - pr_perror("failed to write %s to %s", value, path); - return -1; - } - - return 0; -} - int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; diff --git a/test/zdtm/static/cgroupv2_00.c b/test/zdtm/static/cgroupv2_00.c new file mode 100644 index 0000000000..2c6780e0ce --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.c @@ -0,0 +1,86 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that some cgroup-v2 properties in kernel controllers are preserved"; +const char *test_author = "Bui Quang Minh "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); +const char *cgname = "subcg00"; + +int main(int argc, char **argv) +{ + char path[1024], aux[1024]; + int ret = -1; + + test_init(argc, argv); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { + pr_perror("Can't mount cgroup-v2"); + return -1; + } + + sprintf(path, "%s/%s", dirname, cgname); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + /* Make cpuset controllers available in children directory */ + sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); + sprintf(aux, "%d", getpid()); + if (write_value(path, aux)) + goto out; + + test_daemon(); + test_waitsig(); + + sprintf(path, "%s/%s/%s", dirname, cgname, 
"cgroup.subtree_control"); + if (read_value(path, aux, sizeof(aux))) + goto out; + + if (strcmp(aux, "cpuset\n")) { + fail("cgroup.subtree_control mismatches"); + goto out; + } + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); + if (read_value(path, aux, sizeof(aux))) + goto out; + + if (strcmp(aux, "threaded\n")) { + fail("cgroup.type mismatches"); + goto out; + } + + pass(); + + ret = 0; + +out: + sprintf(path, "%s", dirname); + umount(path); + return ret; +} diff --git a/test/zdtm/static/cgroupv2_00.checkskip b/test/zdtm/static/cgroupv2_00.checkskip new file mode 100755 index 0000000000..375ed35648 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.checkskip @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 +fi + +if [ -d /sys/fs/cgroup/unified ]; then + grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 +fi + +exit 1 diff --git a/test/zdtm/static/cgroupv2_00.desc b/test/zdtm/static/cgroupv2_00.desc new file mode 100644 index 0000000000..4bfd4b2656 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_00.hook b/test/zdtm/static/cgroupv2_00.hook new file mode 100755 index 0000000000..1002b1ec54 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.hook @@ -0,0 +1,16 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e +cgname="subcg00" +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup2 cgroup2 $tname + +echo "Cleaning $tname" +echo "-cpuset" > "$tname/$cgname/cgroup.subtree_control" + +set +e +rmdir "$tname/$cgname" +umount "$tname" +rmdir "$tname" diff --git a/test/zdtm/static/cgroupv2_01.c b/test/zdtm/static/cgroupv2_01.c new file mode 100644 index 0000000000..f3a6d18baf --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.c @@ -0,0 +1,180 @@ +#include +#include +#include +#include + 
+#include "zdtmtst.h" + +const char *test_doc = "Check that cgroup-v2 threaded controllers"; +const char *test_author = "Bui Quang Minh "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); +const char *cgname = "subcg01"; + +task_waiter_t t; + +#define gettid(code) syscall(__NR_gettid) + +void cleanup(void) +{ + char path[1024]; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); + rmdir(path); + sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); + rmdir(path); + sprintf(path, "%s/%s", dirname, cgname); + rmdir(path); + sprintf(path, "%s", dirname); + umount(path); +} + +int is_in_cgroup(char *cgname) +{ + FILE *cgf; + char buffer[1024]; + + sprintf(buffer, "/proc/self/task/%ld/cgroup", gettid()); + cgf = fopen(buffer, "r"); + if (cgf == NULL) { + pr_err("Fail to open thread's cgroup procfs\n"); + return 0; + } + + while (fgets(buffer, sizeof(buffer), cgf)) { + if (strstr(buffer, cgname)) { + fclose(cgf); + return 1; + } + } + + fclose(cgf); + return 0; +} + +void *thread_func(void *arg) +{ + char path[1024], aux[1024]; + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.threads"); + sprintf(aux, "%ld", gettid()); + if (write_value(path, aux)) { + cleanup(); + exit(1); + } + + read_value(path, aux, sizeof(aux)); + + task_waiter_complete(&t, 1); + + /* Wait for restore */ + task_waiter_wait4(&t, 2); + + sprintf(path, "/%s/%s", cgname, "thread2"); + if (!is_in_cgroup(path)) { + fail("Thread2's cgroup is not restored"); + cleanup(); + exit(1); + } + + return NULL; +} + +int main(int argc, char **argv) +{ + char path[1024], aux[1024]; + pthread_t thread2; + int ret = 1; + + test_init(argc, argv); + task_waiter_init(&t); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { + pr_perror("Can't mount cgroup-v2"); + return -1; + } + + sprintf(path, "%s/%s", dirname, cgname); + if (mkdir(path, 0700) < 0 && errno 
!= EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + /* Make cpuset controllers available in children directory */ + sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); + sprintf(aux, "%d", getpid()); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + ret = pthread_create(&thread2, NULL, thread_func, NULL); + if (ret) { /* pthread_create() returns a positive error number on failure, never a negative value */ + pr_err("pthread_create %s\n", strerror(ret)); + ret = 1; + goto out; + } + ret = 1; /* pthread_create() left 0 here; restore the failure default for the error paths below */ + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.threads"); + sprintf(aux, "%ld", gettid()); + if (write_value(path, aux)) + goto out; + + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + task_waiter_complete(&t, 2); + + sprintf(path, "/%s/%s", cgname, "thread1"); + if (!is_in_cgroup(path)) { + fail("Main thread's cgroup is not restored"); + cleanup(); + exit(1); + } + pthread_join(thread2, NULL); + pass(); + + ret = 0; + +out: + cleanup(); + return ret; +} diff --git a/test/zdtm/static/cgroupv2_01.checkskip b/test/zdtm/static/cgroupv2_01.checkskip new file mode 100755 index 0000000000..375ed35648 --- /dev/null +++
b/test/zdtm/static/cgroupv2_01.checkskip @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 +fi + +if [ -d /sys/fs/cgroup/unified ]; then + grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 +fi + +exit 1 diff --git a/test/zdtm/static/cgroupv2_01.desc b/test/zdtm/static/cgroupv2_01.desc new file mode 100644 index 0000000000..4bfd4b2656 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_01.hook b/test/zdtm/static/cgroupv2_01.hook new file mode 100755 index 0000000000..2263fd0146 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.hook @@ -0,0 +1,24 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e +cgname="subcg01" +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup2 cgroup2 $tname + +echo "Cleaning $tname" + +set +e +rmdir "$tname/$cgname/thread1" + +# When the test finishes, the cleanup() function removes this directory +# successfully because the thread in this controller exit and no other +# threads belong to this controller +if [ "$1" == "--pre-restore" ]; then + rmdir "$tname/$cgname/thread2" +fi + +rmdir "$tname/$cgname" +umount "$tname" +rmdir "$tname" diff --git a/test/zdtm/static/ghost_holes_large00.c b/test/zdtm/static/ghost_holes_large00.c new file mode 100644 index 0000000000..1a9739f8e9 --- /dev/null +++ b/test/zdtm/static/ghost_holes_large00.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with one large hole(1GiB) in the middle"; +const char *test_author = "Liang-Chun Chen "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for data size */ +#ifdef LIMIT +#define BUFSIZE 1024 * 1024 +#else +#define BUFSIZE 4096 
+#endif +static unsigned char buf[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +#define DATA1_OFF 0 +#define HOLE_SIZE (1LL * 1 * 1024 * 1024 * 1024) +#define DATA2_OFF (BUFSIZE + HOLE_SIZE) +#define FILE_SIZE (2 * BUFSIZE + HOLE_SIZE) +#define ST_UNIT 512 + +int main(int argc, char **argv) +{ + int fd; + struct stat st; + uint32_t crc; + bool chk_hole = true; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + crc = ~0; + datagen(buf, BUFSIZE, &crc); + if (pwrite(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { + pr_perror("can't write data1"); + goto failed; + } + + crc = ~0; + datagen(buf, BUFSIZE, &crc); + if (pwrite(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { + pr_perror("can't write data2"); + goto failed; + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + test_msg("Won't check for hole\n"); + chk_hole = false; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("file size OK\n"); + + if (st.st_blocks * ST_UNIT != 2 * BUFSIZE) { + fail("actual file size changed to %ld", (long)st.st_blocks * ST_UNIT); + goto failed; + } + + test_msg("actual file size OK\n"); + + /* Data 1 */ + if (pread(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { + fail("pread1 fail"); + goto failed; + } + + crc = ~0; + if (datachk(buf, BUFSIZE, &crc)) { + fail("datachk1 fail"); + goto failed; + } + + test_msg("Data1 OK\n"); + + /* Data 2 */ + if (pread(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { + fail("pread2 fail"); + goto failed; + } + + crc = ~0; + if (datachk(buf, 
BUFSIZE, &crc)) { + fail("datachk2 fail"); + goto failed; + } + + test_msg("Data2 OK\n"); + + /* Hole */ + if (chk_hole) { + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + fail("Begin of mid hole not found"); + goto failed; + } + if (lseek(fd, DATA1_OFF + BUFSIZE, SEEK_DATA) != DATA2_OFF) { + fail("End of mid hole not found"); + goto failed; + } + test_msg("Mid hole OK\n"); + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} diff --git a/test/zdtm/static/ghost_holes_large01.c b/test/zdtm/static/ghost_holes_large01.c new file mode 120000 index 0000000000..1b90363d45 --- /dev/null +++ b/test/zdtm/static/ghost_holes_large01.c @@ -0,0 +1 @@ +ghost_holes_large00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_holes_large01.desc b/test/zdtm/static/ghost_holes_large01.desc new file mode 100644 index 0000000000..8e6a476bd7 --- /dev/null +++ b/test/zdtm/static/ghost_holes_large01.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} \ No newline at end of file diff --git a/test/zdtm/static/ghost_multi_hole00.c b/test/zdtm/static/ghost_multi_hole00.c new file mode 100644 index 0000000000..0f78d4f144 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole00.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with a lot of holes(every 8K length contains only 4K data)"; +const char *test_author = "Liang-Chun Chen "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for hole size */ +#define BUFSIZE 4096 +static unsigned char buf4k[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +#define FILE_SIZE (1 << 23) /* 8Mb */ + +#define FILE_INTERVAL (1 << 13) /* 8Kb */ + +int main(int argc, char **argv) +{ + int fd, off; + struct stat st; + uint32_t crc; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + 
if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + crc = ~0; + datagen(buf4k, BUFSIZE, &crc); + if (pwrite(fd, &buf4k, BUFSIZE, off) != BUFSIZE) { + perror("pwrite"); + goto failed; + } + + /* + * On some file systems, such as xfs, + * pwrite alone might not be able to create a highly sparse file, + * so we need to forcibly punch a hole inside the file. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off + BUFSIZE, BUFSIZE)) { + perror("fallocate"); + goto failed; + } + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("Size %u OK\n", FILE_SIZE); + + /* Data */ + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + if (pread(fd, buf4k, BUFSIZE, off) != BUFSIZE) { + fail("pread failed @ %u", off / FILE_INTERVAL); + goto failed; + } + + crc = ~0; + if (datachk(buf4k, BUFSIZE, &crc)) { + fail("datachk failed @ %u", off / FILE_INTERVAL); + goto failed; + } + + test_msg("Data @%d OK\n", off / FILE_INTERVAL); + } + + /* Hole */ + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + if (lseek(fd, off, SEEK_HOLE) != off + BUFSIZE) { + fail("failed to find hole @ %u", off / FILE_INTERVAL); + goto failed; + } + test_msg("Hole @%d OK\n", off / FILE_INTERVAL); + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} diff --git a/test/zdtm/static/ghost_multi_hole00.desc b/test/zdtm/static/ghost_multi_hole00.desc new file mode 100644 index 0000000000..3981e81804 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole00.desc @@ -0,0 +1 @@ +{'dopts': '--ghost-limit 8M --no-ghost-fiemap'} diff
--git a/test/zdtm/static/ghost_multi_hole01.c b/test/zdtm/static/ghost_multi_hole01.c new file mode 120000 index 0000000000..c75006a6bf --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole01.c @@ -0,0 +1 @@ +ghost_multi_hole00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_multi_hole01.desc b/test/zdtm/static/ghost_multi_hole01.desc new file mode 100644 index 0000000000..d1dc68a54d --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole01.desc @@ -0,0 +1 @@ +{'dopts': '--ghost-limit 8M --ghost-fiemap'} diff --git a/test/zdtm/static/inotify_system.c b/test/zdtm/static/inotify_system.c index 141476415a..079d4b1613 100644 --- a/test/zdtm/static/inotify_system.c +++ b/test/zdtm/static/inotify_system.c @@ -57,7 +57,7 @@ const char *inot_dir = "./inotify.no_del"; (MASK == IN_UNMOUNT) ? "IN_UNMOUNT" : \ (MASK == IN_Q_OVERFLOW) ? "IN_Q_OVERFLOW" : \ (MASK == IN_IGNORED) ? "IN_IGNORED" : \ - "UNKNOWN" + "UNKNOWN" #include #include diff --git a/test/zdtm/static/mnt_ext_multiple.c b/test/zdtm/static/mnt_ext_multiple.c new file mode 100644 index 0000000000..7014927ac3 --- /dev/null +++ b/test/zdtm/static/mnt_ext_multiple.c @@ -0,0 +1,118 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check multiple non-common root external mounts with same external master"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname = "mnt_ext_multiple.test"; +char *source = "zdtm_ext_multiple"; +char *ext_source = "zdtm_ext_multiple.ext"; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char *root, testdir[PATH_MAX]; + char dst_a[PATH_MAX], dst_b[PATH_MAX]; + char src[PATH_MAX], src_a[PATH_MAX], src_b[PATH_MAX]; + char nsdst_a[PATH_MAX], nsdst_b[PATH_MAX]; + char *tmp = "/tmp/zdtm_ext_multiple.tmp"; + char *zdtm_newns = getenv("ZDTM_NEWNS"); + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is 
not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare directories in test root */ + sprintf(testdir, "%s/%s", root, dirname); + mkdir(testdir, 0755); + sprintf(dst_a, "%s/%s/dst_a", root, dirname); + mkdir(dst_a, 0755); + sprintf(dst_b, "%s/%s/dst_b", root, dirname); + mkdir(dst_b, 0755); + + /* Prepare directories in criu root */ + mkdir(tmp, 0755); + if (mount(source, tmp, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + sprintf(src, "%s/src", tmp); + mkdir(src, 0755); + + /* Create a shared mount in criu mntns */ + if (mount(ext_source, src, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, src, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + if (mount(NULL, src, NULL, MS_SHARED, NULL)) { + pr_perror("make shared"); + return 1; + } + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + /* + * Populate to the tests root subdirectories of the src mount + */ + sprintf(src_a, "%s/src/a", tmp); + mkdir(src_a, 0755); + if (mount(src_a, dst_a, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + sprintf(src_b, "%s/src/b", tmp); + mkdir(src_b, 0755); + if (mount(src_b, dst_b, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + +test: + test_init(argc, argv); + + /* Make "external" mounts to have external master */ + sprintf(nsdst_a, "/%s/dst_a", dirname); + if (mount(NULL, nsdst_a, NULL, MS_SLAVE, NULL)) { + pr_perror("make slave"); + return 1; + } + sprintf(nsdst_b, "/%s/dst_b", dirname); + if (mount(NULL, nsdst_b, NULL, MS_SLAVE, NULL)) { + pr_perror("make slave"); + return 1; + } + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/test/zdtm/static/mnt_ext_multiple.desc b/test/zdtm/static/mnt_ext_multiple.desc 
new file mode 100644 index 0000000000..fd413ed15c --- /dev/null +++ b/test/zdtm/static/mnt_ext_multiple.desc @@ -0,0 +1,5 @@ +{ 'dopts': '--external mnt[/mnt_ext_multiple.test/dst_a]:MNT_A --external mnt[/mnt_ext_multiple.test/dst_b]:MNT_B', + 'feature': 'mnt_id move_mount_set_group', + 'flavor': 'ns uns', + 'flags': 'suid', + 'ropts': '--external mnt[MNT_A]:/tmp/zdtm_ext_multiple.tmp/src/a --external mnt[MNT_B]:/tmp/zdtm_ext_multiple.tmp/src/b --no-mntns-compat-mode'} diff --git a/test/zdtm/static/mnt_root_ext.c b/test/zdtm/static/mnt_root_ext.c new file mode 100644 index 0000000000..305e872627 --- /dev/null +++ b/test/zdtm/static/mnt_root_ext.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check external mount from host's rootfs"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname = "mnt_root_ext.test"; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char *root, testdir[PATH_MAX], nstestdir[PATH_MAX]; + char *zdtm_newns = getenv("ZDTM_NEWNS"); + char tmp[] = "/.zdtm_root_ext.tmp"; + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare directories in test root */ + sprintf(testdir, "%s/%s", root, dirname); + mkdir(testdir, 0755); + + /* Prepare directories in criu root */ + mkdir(tmp, 0755); + + /* Make criu's mntns root mount shared */ + if (mount(NULL, "/", NULL, MS_SHARED, NULL)) { + pr_perror("make shared"); + return 1; + } + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + /* + * Make mounts in temporary mntns slave, to prevent propagation to criu mntns + */ + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { + pr_perror("make rslave"); + return 1; + } + 
+ /* + * Populate to the tests root host's rootfs subdir + */ + if (mount(tmp, testdir, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } +test: + test_init(argc, argv); + + /* + * Make "external" mount to be slave + */ + sprintf(nstestdir, "/%s", dirname); + if (mount(NULL, nstestdir, NULL, MS_SLAVE, NULL)) { + pr_perror("make slave"); + return 1; + } + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/test/zdtm/static/mnt_root_ext.desc b/test/zdtm/static/mnt_root_ext.desc new file mode 100644 index 0000000000..fee7efbae2 --- /dev/null +++ b/test/zdtm/static/mnt_root_ext.desc @@ -0,0 +1,5 @@ +{ 'dopts': '--external mnt[/mnt_root_ext.test]:MNT', + 'feature': 'mnt_id move_mount_set_group', + 'flavor': 'ns uns', + 'flags': 'suid', + 'ropts': '--external mnt[MNT]:.zdtm_root_ext.tmp --no-mntns-compat-mode'} diff --git a/test/zdtm/static/mnt_root_ext.hook b/test/zdtm/static/mnt_root_ext.hook new file mode 100755 index 0000000000..a5286f208c --- /dev/null +++ b/test/zdtm/static/mnt_root_ext.hook @@ -0,0 +1,5 @@ +#!/bin/bash + +[ "$1" == "--clean" ] || exit 0 + +rmdir /.zdtm_root_ext.tmp diff --git a/test/zdtm/static/s390x_regs_check.c b/test/zdtm/static/s390x_regs_check.c index 40c480b3f2..82dca0519d 100644 --- a/test/zdtm/static/s390x_regs_check.c +++ b/test/zdtm/static/s390x_regs_check.c @@ -40,13 +40,13 @@ const char *test_author = "Michael Holzheu "; * * - Verify that "criu restore" sets the correct register sets * from "criu dump": - * $ zdtmp.py run -t zdtm/static/s390x_regs_check + * $ zdtm.py run -t zdtm/static/s390x_regs_check * * - Verify that dumpee continues running with correct registers after * parasite injection: - * $ zdtmp.py run --norst -t zdtm/static/s390x_regs_check - * $ zdtmp.py run --norst --pre 2 -t zdtm/static/s390x_regs_check - * $ zdtmp.py run --check-only -t zdtm/static/s390x_regs_check + * $ zdtm.py run --norst -t zdtm/static/s390x_regs_check + * $ zdtm.py run --norst --pre 2 -t 
zdtm/static/s390x_regs_check + * $ zdtm.py run --check-only -t zdtm/static/s390x_regs_check */ #define NR_THREADS 2 #define NR_THREADS_ALL (NR_THREADS + 1) diff --git a/test/zdtm/static/scm00.c b/test/zdtm/static/scm00.c index d669755828..670e6fd6a4 100644 --- a/test/zdtm/static/scm00.c +++ b/test/zdtm/static/scm00.c @@ -105,6 +105,9 @@ int main(int argc, char **argv) p[1] = p[0]; p[0] = -1; #endif +#endif +#ifdef CLOSE_SENDER_FD + close(sk[0]); #endif test_daemon(); diff --git a/test/zdtm/static/scm09.c b/test/zdtm/static/scm09.c new file mode 120000 index 0000000000..4cab0edd20 --- /dev/null +++ b/test/zdtm/static/scm09.c @@ -0,0 +1 @@ +scm00.c \ No newline at end of file diff --git a/test/zdtm/static/shm-hugetlb.checkskip b/test/zdtm/static/shm-hugetlb.checkskip new file mode 100755 index 0000000000..df23708156 --- /dev/null +++ b/test/zdtm/static/shm-hugetlb.checkskip @@ -0,0 +1,4 @@ +#!/bin/bash + +# will fail with EOPNOTSUPP +cat /proc/sys/vm/nr_hugepages &> /dev/null diff --git a/test/zdtm/static/sk-unix-listen01.c b/test/zdtm/static/sk-unix-listen01.c new file mode 100644 index 0000000000..5c9274acb0 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen01.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test in-flight unix sockets with data in them\n"; +const char *test_author = "Andrei Vagin "; + +#define SK_DATA "packet" + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +#define TEST_MODE 0640 + +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + +int main(int argc, char *argv[]) +{ + struct sockaddr_un addr; + unsigned int addrlen; + int ssk, sk; + + char path[PATH_MAX]; + char *cwd; + int ret; + + test_init(argc, argv); + + cwd = get_current_dir_name(); + if (!cwd) + return pr_perror("get_current_dir_name"); + + 
snprintf(path, sizeof(path), "%s/%s", cwd, filename); + unlink(path); + + addr.sun_family = AF_UNIX; + addrlen = strlen(filename); + if (addrlen > sizeof(addr.sun_path)) + return pr_err("address is too long"); + memcpy(addr.sun_path, filename, addrlen); + addrlen += sizeof(addr.sun_family); + + ssk = socket(AF_UNIX, SOCK_TYPE, 0); + if (ssk == -1) + return pr_perror("socket"); + + sk = socket(AF_UNIX, SOCK_TYPE, 0); + if (sk < 0) + return pr_perror("socket"); + + ret = bind(ssk, (struct sockaddr *)&addr, addrlen); + if (ret) + return pr_perror("bind"); + + ret = listen(ssk, 16); + if (ret) + return pr_perror("listen"); + + if (connect(sk, (struct sockaddr *)&addr, addrlen)) + return pr_perror("connect"); + +#ifdef SK_UNIX_LISTEN02 + { + char buf[64]; + memset(buf, 0, sizeof(buf)); + write(sk, SK_DATA, sizeof(SK_DATA)); + } +#endif + +#ifdef SK_UNIX_LISTEN03 + close(sk); + sk = -1; +#endif + + test_daemon(); + test_waitsig(); + + if (sk != -1) + close(sk); + + ret = accept(ssk, NULL, NULL); + if (ret < 0) + return fail("accept"); + +#ifdef SK_UNIX_LISTEN02 + { + char buf[64]; + if (read(ret, &buf, sizeof(buf)) != sizeof(SK_DATA)) + return pr_perror("read"); + + if (strcmp(buf, SK_DATA)) + return fail("data corrupted"); + } +#endif + + close(ssk); + unlink(path); + + pass(); + return 0; +} diff --git a/test/zdtm/static/sk-unix-listen02.c b/test/zdtm/static/sk-unix-listen02.c new file mode 120000 index 0000000000..1211f46660 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen02.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/unlink_largefile.desc b/test/zdtm/static/sk-unix-listen02.desc similarity index 100% rename from test/zdtm/static/unlink_largefile.desc rename to test/zdtm/static/sk-unix-listen02.desc diff --git a/test/zdtm/static/sk-unix-listen03.c b/test/zdtm/static/sk-unix-listen03.c new file mode 120000 index 0000000000..1211f46660 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen03.c @@ -0,0 +1 @@ 
+sk-unix-listen01.c
\ No newline at end of file
diff --git a/test/zdtm/static/sk-unix-listen03.desc b/test/zdtm/static/sk-unix-listen03.desc
new file mode 100644
index 0000000000..ded89879a9
--- /dev/null
+++ b/test/zdtm/static/sk-unix-listen03.desc
@@ -0,0 +1 @@
+{'flags': 'crfail'}
diff --git a/test/zdtm/static/sk-unix-listen04.c b/test/zdtm/static/sk-unix-listen04.c
new file mode 120000
index 0000000000..1211f46660
--- /dev/null
+++ b/test/zdtm/static/sk-unix-listen04.c
@@ -0,0 +1 @@
+sk-unix-listen01.c
\ No newline at end of file
diff --git a/test/zdtm/static/sk-unix-listen04.desc b/test/zdtm/static/sk-unix-listen04.desc
new file mode 100644
index 0000000000..ded89879a9
--- /dev/null
+++ b/test/zdtm/static/sk-unix-listen04.desc
@@ -0,0 +1 @@
+{'flags': 'crfail'}
diff --git a/test/zdtm/static/stopped03.c b/test/zdtm/static/stopped03.c
new file mode 100644
index 0000000000..9a373930fe
--- /dev/null
+++ b/test/zdtm/static/stopped03.c
@@ -0,0 +1,187 @@
+#include <signal.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#include "zdtmtst.h"
+#include "lock.h"
+
+const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly";
+const char *test_author = "Yuriy Vasiliev ";
+
+#define STOP_SIGNO SIGTSTP
+const char *stop_sigstr = "SIGTSTP";
+
+/*
+ * States of the handshake between the main process and the worker.
+ * Every wait uses futex_wait_while_lt(), so the values must only grow
+ * and TEST_EMERGENCY_ABORT must stay the largest one.
+ */
+enum {
+	FUTEX_INITIALIZED = 0,
+	TEST_CRIU,
+	TEST_CHECK,
+	TEST_DONE,
+	TEST_EXIT,
+	TEST_EMERGENCY_ABORT,
+};
+
+/* Shared between all processes of the test (MAP_SHARED mapping) */
+struct shared {
+	futex_t fstate;
+	int status;
+	int code;
+} *sh;
+
+/*
+ * Worker: becomes a process group leader, forks a task, stops it with
+ * STOP_SIGNO and publishes what waitid() reports about it to the main
+ * process, once before and once after C/R.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+static int new_pgrp(void)
+{
+	siginfo_t infop;
+	int ret = 1;
+	pid_t pid;
+
+	/*
+	 * Set the PGID to avoid creating an orphaned process group,
+	 * which is not to be affected by terminal-generated stop signals.
+	 */
+	setpgid(0, 0);
+
+	pid = test_fork();
+	if (pid < 0)
+		goto err_cr;
+
+	if (pid == 0) {
+		/* wait for TEST_EXIT or TEST_EMERGENCY_ABORT */
+		futex_wait_while_lt(&sh->fstate, TEST_EXIT);
+		exit(0);
+	}
+
+	if (kill(pid, STOP_SIGNO)) {
+		pr_perror("Unable to send %s", stop_sigstr);
+		goto err_cr;
+	}
+
+	/* WNOWAIT leaves the task waitable, so the stop can be queried again */
+	if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) {
+		pr_perror("Unable to waitid %d", pid);
+		goto err_cont;
+	}
+
+	sh->code = infop.si_code;
+	sh->status = infop.si_status;
+
+	/* Return the control back to MAIN worker to do C/R */
+	futex_set_and_wake(&sh->fstate, TEST_CRIU);
+	futex_wait_while_lt(&sh->fstate, TEST_CHECK);
+
+	/* MAIN gave up before C/R: resume the task and bail out */
+	if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT)
+		goto err_cont;
+
+	infop.si_code = 0;
+	infop.si_status = 0;
+
+	if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) {
+		pr_perror("Unable to waitid %d", pid);
+		goto err_cont;
+	}
+
+	sh->code = infop.si_code;
+	sh->status = infop.si_status;
+
+	futex_set_and_wake(&sh->fstate, TEST_DONE);
+	futex_wait_while_lt(&sh->fstate, TEST_EXIT);
+
+	ret = 0;
+err_cont:
+	kill(pid, SIGCONT);
+err_cr:
+	if (ret)
+		futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT);
+	if (pid > 0)
+		wait(NULL);
+
+	return ret;
+}
+
+/*
+ * Main process: starts the worker, checks the stop state it reported,
+ * goes through C/R and checks the state again.
+ */
+int main(int argc, char **argv)
+{
+	int fail = 0;
+	pid_t pid;
+
+	test_init(argc, argv);
+
+	sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (sh == MAP_FAILED) {
+		pr_perror("Failed to alloc shared region");
+		return 1;
+	}
+
+	futex_set(&sh->fstate, FUTEX_INITIALIZED);
+
+	pid = test_fork();
+	if (pid < 0) {
+		fail = 1;
+		goto out;
+	}
+
+	if (pid == 0)
+		exit(new_pgrp());
+
+	/* Wait until pgrp is ready to C/R */
+	futex_wait_while_lt(&sh->fstate, TEST_CRIU);
+	if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) {
+		pr_err("Fail in child worker before C/R\n");
+		fail = 1;
+		goto out;
+	}
+
+	if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) {
+		pr_err("Process is not in correct state before C/R."
+		       " Expected stop signo: %d. Get stop signo: %d\n",
+		       STOP_SIGNO, sh->status);
+		fail = 1;
+		/* The worker sleeps until TEST_CHECK: release it before reaping */
+		futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT);
+		goto out;
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	futex_set_and_wake(&sh->fstate, TEST_CHECK);
+	futex_wait_while_lt(&sh->fstate, TEST_DONE);
+	if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) {
+		pr_err("Fail in child worker after C/R\n");
+		fail = 1;
+		goto out;
+	}
+
+	if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) {
+		fail = 1;
+		pr_err("Process is not in correct state after C/R."
+		       " Expected stop signo: %d. Get stop signo: %d\n",
+		       STOP_SIGNO, sh->status);
+	}
+
+	if (!fail)
+		pass();
+
+	futex_set_and_wake(&sh->fstate, TEST_EXIT);
+out:
+	if (pid > 0)
+		wait(NULL);
+
+	munmap(sh, sizeof(struct shared));
+
+	return fail;
+}
diff --git a/test/zdtm/static/stopped04.c b/test/zdtm/static/stopped04.c
new file mode 100644
index 0000000000..9bd968aa2b
--- /dev/null
+++ b/test/zdtm/static/stopped04.c
@@ -0,0 +1,159 @@
+#include <signal.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#include "zdtmtst.h"
+#include "lock.h"
+
+const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly";
+const char *test_author = "Yuriy Vasiliev ";
+
+const char *stop_sigstr = "SIGTSTP";
+
+/*
+ * States of the handshake between the main process and the worker.
+ * Every wait uses futex_wait_while_lt(), so the values must only grow
+ * and TEST_EMERGENCY_ABORT must stay the largest one.
+ */
+enum {
+	FUTEX_INITIALIZED = 0,
+	TEST_CRIU,
+	TEST_DONE,
+	TEST_EXIT,
+	TEST_EMERGENCY_ABORT,
+};
+
+/* Shared between all processes of the test (MAP_SHARED mapping) */
+struct shared {
+	futex_t fstate;
+	int status;
+	int code;
+} *sh;
+
+/*
+ * Worker: becomes a process group leader, forks a task, stops it with
+ * SIGSTOP, sends it a SIGTSTP on top and then hands over to the main
+ * process for C/R.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+static int new_pgrp(void)
+{
+	sigset_t sigset;
+	siginfo_t infop;
+	int ret = 1;
+	pid_t pid;
+
+	/*
+	 * Set the PGID to avoid creating an orphaned process group,
+	 * which is not to be affected by terminal-generated stop signals.
+	 */
+	setpgid(0, 0);
+
+	/*
+	 * Block SIGTSTP before forking: the task is expected to inherit
+	 * the mask, so the SIGTSTP sent to it below stays pending.
+	 */
+	sigemptyset(&sigset);
+	sigaddset(&sigset, SIGTSTP);
+	sigprocmask(SIG_BLOCK, &sigset, NULL);
+
+	pid = test_fork();
+	if (pid < 0)
+		goto err_cr;
+
+	if (pid == 0) {
+		/* wait for TEST_EXIT or TEST_EMERGENCY_ABORT */
+		futex_wait_while_lt(&sh->fstate, TEST_EXIT);
+		exit(0);
+	}
+
+	if (kill(pid, SIGSTOP)) {
+		pr_perror("Unable to send SIGSTOP");
+		goto err_cr;
+	}
+
+	if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) {
+		pr_perror("Unable to waitid %d", pid);
+		goto err_cont;
+	}
+
+	if (kill(pid, SIGTSTP)) {
+		pr_perror("Unable to send %s", stop_sigstr);
+		/* The task is already stopped: resume it so it can be reaped */
+		goto err_cont;
+	}
+
+	/* Return the control back to MAIN worker to do C/R */
+	futex_set_and_wake(&sh->fstate, TEST_CRIU);
+	futex_wait_while_lt(&sh->fstate, TEST_EXIT);
+
+	ret = 0;
+err_cont:
+	kill(pid, SIGCONT);
+err_cr:
+	if (ret)
+		futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT);
+	if (pid > 0)
+		wait(NULL);
+
+	return ret;
+}
+
+/*
+ * Main process: starts the worker, waits until the stopped task is
+ * ready and goes through C/R.
+ */
+int main(int argc, char **argv)
+{
+	int fail = 0;
+	pid_t pid;
+
+	test_init(argc, argv);
+
+	sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (sh == MAP_FAILED) {
+		pr_perror("Failed to alloc shared region");
+		return 1;
+	}
+
+	futex_set(&sh->fstate, FUTEX_INITIALIZED);
+
+	pid = test_fork();
+	if (pid < 0) {
+		fail = 1;
+		goto out;
+	}
+
+	if (pid == 0)
+		exit(new_pgrp());
+
+	/* Wait until pgrp is ready to C/R */
+	futex_wait_while_lt(&sh->fstate, TEST_CRIU);
+	if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) {
+		pr_err("Fail in child worker before C/R\n");
+		fail = 1;
+		goto out;
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) {
+		pr_err("Fail in child worker after C/R\n");
+		fail = 1;
+		goto out;
+	}
+
+	if (!fail)
+		pass();
+
+	futex_set_and_wake(&sh->fstate, TEST_EXIT);
+out:
+	if (pid > 0)
+		wait(NULL);
+
+	munmap(sh, sizeof(struct shared));
+
+	return fail;
+}
diff --git a/test/zdtm/transition/Makefile
b/test/zdtm/transition/Makefile index 98440f4e2c..ab735bdd4e 100644 --- a/test/zdtm/transition/Makefile +++ b/test/zdtm/transition/Makefile @@ -25,6 +25,7 @@ TST_NOFILE = \ pidfd_store_sk \ rseq01 \ rseq02 \ + stack \ TST_FILE = \ diff --git a/test/zdtm/transition/maps007.c b/test/zdtm/transition/maps007.c index 8a605cfe03..35c196bc43 100644 --- a/test/zdtm/transition/maps007.c +++ b/test/zdtm/transition/maps007.c @@ -38,7 +38,7 @@ int main(int argc, char **argv) struct { futex_t delta; futex_t stop; - } * shm; + } *shm; uint32_t v; unsigned long long count = 0; int i; diff --git a/test/zdtm/transition/stack.c b/test/zdtm/transition/stack.c new file mode 100644 index 0000000000..9548b91822 --- /dev/null +++ b/test/zdtm/transition/stack.c @@ -0,0 +1,16 @@ +#include "zdtmtst.h" + +const char *test_doc = "Tests that parasite code does not write past the start of the stack"; +const char *test_author = "Younes Manton "; + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 0e8eeff8a3..5e849b904b 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -93,44 +93,50 @@ static int create_timens(void) int main(int argc, char **argv) { + uid_t uid; pid_t pid; int status; + uid = getuid(); + /* * pidns is used to avoid conflicts * mntns is used to mount /proc * net is used to avoid conflicts of parasite sockets */ - if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) - return 1; + if (!uid) + if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) + return 1; pid = fork(); if (pid == 0) { - if (create_timens()) - exit(1); - if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { - fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); - return 1; - } - umount2("/proc", MNT_DETACH); - umount2("/dev/pts", MNT_DETACH); - if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { - fprintf(stderr, "mount(/proc): %m"); - return 1; + if 
(!uid) { + if (create_timens()) + exit(1); + if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { + fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); + return 1; + } + umount2("/proc", MNT_DETACH); + umount2("/dev/pts", MNT_DETACH); + if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { + fprintf(stderr, "mount(/proc): %m"); + return 1; + } + if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { + fprintf(stderr, "mount(pts): %m"); + return 1; + } + if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { + fprintf(stderr, "mount(binfmt_misc): %m"); + return 1; + } + if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { + fprintf(stderr, "mount(ptmx): %m"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; } - if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { - fprintf(stderr, "mount(pts): %m"); - return 1; - } - if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { - fprintf(stderr, "mount(binfmt_misc): %m"); - return 1; - } - if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { - fprintf(stderr, "mount(ptmx): %m"); - return 1; - } - if (system("ip link set up dev lo")) - return 1; execv(argv[1], argv + 1); fprintf(stderr, "execve: %m"); return 1;