diff --git a/.cirrus.yml b/.cirrus.yml index bd4799fd0b..adaa9be334 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,10 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto libdrm-devel + # The image has a too old version of nettle which does not work with gnutls. + # Just upgrade to the latest to make the error go away. + dnf -y upgrade nettle nettle-devel systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. 
# The Cirrus CI user runs as a service from selinux point of view and is @@ -108,7 +111,7 @@ task: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-importlib-metadata python3-junit_xml xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed @@ -120,38 +123,6 @@ task: build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" -task: - name: CentOS 7 based test - environment: - HOME: "/root" - CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: centos-cloud - image: family/centos-7 - platform: linux - cpu: 4 - memory: 8G - - setup_script: | - # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. 
- # Do not fail if latest epel repository definition is already installed - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - # Enable user namespaces on CentOS 7 - echo 10000 > /proc/sys/user/max_user_namespaces - # Adapt sudoers to our needs - echo 'root ALL=(ALL:ALL) ALL' | EDITOR='tee -a' visudo - - build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" - task: name: aarch64 build GCC (native) arm_container: diff --git a/.clang-format b/.clang-format index 4756380158..fb40bc613b 100644 --- a/.clang-format +++ b/.clang-format @@ -53,7 +53,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false -ColumnLimit: 120 +ColumnLimit: 0 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false # Unknown to clang-format-4.0 ConstructorInitializerAllOnOneLineOrOnePerLine: false 
diff --git a/.codespellrc b/.codespellrc index 765dacfabb..dd31dd851c 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] skip = ./.git,./test/pki -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml new file mode 100644 index 0000000000..be2fbd2856 --- /dev/null +++ b/.github/workflows/check-commits.yml @@ -0,0 +1,30 @@ +name: Verify self-contained commits + +on: pull_request + +# Cancel any preceding run on the pull request +concurrency: + group: commit-test-${{ github.event.pull_request.number }} + +jobs: + build: + runs-on: ubuntu-latest + # Check if pull request does not have label "not-selfcontained-ok" + if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" + steps: + - uses: actions/checkout@v3 + with: + # Needed to rebase against the base branch + fetch-depth: 0 + # Checkout pull request HEAD commit instead of merge commit + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: sudo apt-get install -y libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + - name: Configure git user details + run: | + git config --global user.email "checkpoint-restore@users.noreply.github.com" + git config --global user.name "checkpoint-restore" + - name: Configure base branch without switching current branch + run: git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + - name: Build each commit + run: git rebase ${{ github.base_ref }} -x "make -C scripts/ci check-commit" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e18f921f3e..f52bce8123 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -26,15 +26,15 @@ jobs: run: make lint - name: Run make indent - run: > + continue-on-error: 
true + run: | if [ -z "${{github.base_ref}}" ]; then - git fetch --deepen=1 && - if ! make indent OPTS=--diff; then - exit 1 - fi + git fetch --deepen=1 + make indent else - git fetch origin ${{github.base_ref}} && - if ! make indent OPTS=--diff BASE=origin/${{github.base_ref}}; then - exit 1 - fi + git fetch origin ${{github.base_ref}} + make indent BASE=origin/${{github.base_ref}} fi + - name: Raise in-line make indent warnings + run: | + git diff | ./scripts/github-indent-warnings.py diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml new file mode 100644 index 0000000000..ba22fa25ff --- /dev/null +++ b/.github/workflows/loongarch64-qemu-test.yml @@ -0,0 +1,15 @@ +name: LoongArch64 Qemu Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: loongarch64-qemu-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v2 + - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/.github/workflows/manage-labels.yml b/.github/workflows/manage-labels.yml new file mode 100644 index 0000000000..a2bcd88604 --- /dev/null +++ b/.github/workflows/manage-labels.yml @@ -0,0 +1,14 @@ +name: Remove labels +on: [issue_comment, pull_request_review_comment] +jobs: + remove-labels-on-comments: + name: Remove labels on comments + if: github.event_name == 'issue_comment' + runs-on: ubuntu-latest + steps: + - uses: mondeja/remove-labels-gh-action@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + labels: | + changes requested + awaiting reply diff --git a/.gitignore b/.gitignore index 1ea828bbcd..854657d1c1 100644 --- a/.gitignore +++ b/.gitignore @@ -25,13 +25,6 @@ images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest -crit/crit -criu/arch/*/sys-exec-tbl*.c -# x86 syscalls-table is not generated 
-!criu/arch/x86/sys-exec-tbl.c -criu/arch/*/syscalls*.S -criu/include/syscall-codes*.h -criu/include/syscall*.h criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h diff --git a/.lgtm.yml b/.lgtm.yml index a884a53ef1..0dd49cda41 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -22,7 +22,6 @@ extraction: - "libbsd-dev" - "python3-yaml" - "libnl-route-3-dev" - - "python-future" - "gnutls-dev" configure: command: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 87da08b343..a70506bfbf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,19 +46,18 @@ This should create the `./criu/criu` executable. ## Edit the source code -If you use ctags, you can generate the ctags file by running - -``` - make tags -``` - When you change the source code, please keep in mind the following code conventions: +* code is written to be read, so the code readability is the most important thing you need to have in mind when preparing patches * we prefer tabs and indentations to be 8 characters width -* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community. +* we prefer line length of 80 characters or less, more is allowed if it helps with code readability +* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community -Other conventions can be learned from the source code itself. In short, make sure your new code -looks similar to what is already there. +Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. + +## Automatic tools to fix coding-style + +Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. 
The following command can be used to automatically run a code linter for Python files (flake8), Shell scripts (shellcheck), text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). @@ -90,6 +89,41 @@ to check the last *N* commits for formatting errors, without applying the change Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected we need to review the suggested changes and decide if they should be fixed before merging. +Here are some bad examples of clang-format-ing: + +* if clang-format tries to force 120 characters and breaks readability - it is wrong: + +``` +@@ -58,8 +59,7 @@ static int register_membarriers(void) + } + + if (!all_ok) { +- fail("can't register membarrier()s - tried %#x, kernel %#x", +- barriers_registered, barriers_supported); ++ fail("can't register membarrier()s - tried %#x, kernel %#x", barriers_registered, barriers_supported); + return -1; + } +``` + +* if clang-format breaks your beautiful readability friendly alignment in structures, comments or defines - it is wrong: + +``` +--- a/test/zdtm/static/membarrier.c ++++ b/test/zdtm/static/membarrier.c +@@ -27,9 +27,10 @@ static const struct { + int register_cmd; + int execute_cmd; + } membarrier_cmds[] = { +- { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, +- { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, +- { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, ++ { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, ++ { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, ++ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, ++ { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + }; +``` + ## Test your changes CRIU 
comes with an extensive test suite. To check whether your changes introduce any regressions, run diff --git a/Documentation/Makefile b/Documentation/Makefile index 5085514501..de0cc448dc 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -12,11 +12,9 @@ endif FOOTER := footer.txt SRC1 += crit.txt -ifeq ($(PYTHON),python3) SRC1 += criu-ns.txt -endif SRC1 += compel.txt -SRC1 += amdgpu_plugin.txt +SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) diff --git a/Documentation/amdgpu_plugin.txt b/Documentation/criu-amdgpu-plugin.txt similarity index 85% rename from Documentation/amdgpu_plugin.txt rename to Documentation/criu-amdgpu-plugin.txt index 0d490b4292..35321a9159 100644 --- a/Documentation/amdgpu_plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -3,7 +3,7 @@ ROCM Support(1) NAME ---- -amdgpu_plugin - A plugin extension to CRIU to support checkpoint/restore in +criu-amdgpu-plugin - A plugin extension to CRIU to support checkpoint/restore in userspace for AMD GPUs. @@ -22,7 +22,7 @@ Though *criu* is a great tool for checkpointing and restoring running applications, it has certain limitations such as it cannot handle applications that have device files open. In order to support *ROCm* based workloads with *criu* we need to augment criu's core functionality with a -plugin based extension mechanism. *amdgpu_plugin* provides the necessary support +plugin based extension mechanism. *criu-amdgpu-plugin* provides the necessary support to criu to allow Checkpoint / Restore with ROCm. @@ -97,6 +97,15 @@ executing criu command. E.g: KFD_CAPABILITY_CHECK=1 +*KFD_MAX_BUFFER_SIZE*:: + On some systems, VRAM sizes may exceed RAM sizes, and so buffers for dumping + and restoring VRAM may be unable to fit. Set to a nonzero value (in bytes) + to set a limit on the plugin's memory usage. 
+ Default:0 (Disabled) + + E.g: + KFD_MAX_BUFFER_SIZE="2G" + AUTHOR ------ diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 0e7d19c4cd..d1d34045a0 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -155,6 +155,11 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty + *query-ext-files*::: + called after the process tree is stopped and network is locked. + This hook is used only in the RPC mode. The notification reply + contains file ids to be added to external file list (may be empty). + *--unprivileged*:: This option tells *criu* to accept the limitations when running as non-root. Running as non-root requires *criu* at least to have @@ -364,6 +369,14 @@ mount -t cgroup -o devices,freezer none devices,freezer Deduplicate "old" data in pages images of previous *dump*. This option implies incremental *dump* mode (see the *pre-dump* command). +*--skip-zero-pages*:: + Don't dump pages containing only zero bytes. This is a + potentially expensive operation because it checks for + every single process page if it contains only zeros, but + it can significantly decrease the image size and improve the + startup-time if many such pages exist. It effectively + replaces such pages with the kernel's zero-page on restore. + *-l*, *--file-locks*:: Dump file locks. It is necessary to make sure that all file lock users are taken into dump, so it is only safe to use this for enclosed containers @@ -457,6 +470,9 @@ The 'mode' may be one of the following: *nftables*::: Use nftables rules to drop the packets. + *skip*::: Don't lock the network. If *--tcp-close* is not used, the network + must be locked externally to allow CRIU to dump TCP connections. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. 
diff --git a/Makefile b/Makefile index 8061a42c45..ff0ca92db6 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -35,18 +35,18 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp endif ifeq ($(ARMV),8) # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. - USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif @@ -80,6 +80,10 @@ ifeq ($(ARCH),mips) DEFINES := -DCONFIG_MIPS endif +ifeq ($(ARCH),loongarch64) + DEFINES := -DCONFIG_LOONGARCH64 +endif + # # CFLAGS_PIE: # @@ -102,6 +106,7 @@ export PROTOUFIX DEFINES # # Independent options for all tools. 
DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_LARGEFILE64_SOURCE DEFINES += -D_GNU_SOURCE WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes @@ -122,6 +127,10 @@ ifeq ($(ARCH),mips) WARNINGS := -rdynamic endif +ifeq ($(ARCH),loongarch64) +WARNINGS += -Wno-implicit-function-declaration +endif + ifneq ($(GCOV),) LDFLAGS += -lgcov CFLAGS += $(CFLAGS-GCOV) @@ -151,12 +160,12 @@ export GMON GMONLDOPT endif AFLAGS += -D__ASSEMBLY__ -CFLAGS += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: flog criu lib crit .PHONY: all # @@ -242,6 +251,15 @@ soccr/built-in.o: $(CONFIG_HEADER) .FORCE $(SOCCR_A): |soccr/built-in.o criu-deps += $(SOCCR_A) +#flog gets used by criu, build it earlier + +flogMakefile: ; +flog%: + $(Q) $(MAKE) $(build)=flog $@ +flog: + $(Q) $(MAKE) $(build)=flog all +.PHONY: flog + # # CRIU building done in own directory # with slightly different rules so we @@ -259,38 +277,31 @@ criu: $(criu-deps) $(Q) $(MAKE) $(build)=criu all .PHONY: criu -crit/Makefile: ; -crit/%: criu .FORCE - $(Q) $(MAKE) $(build)=crit $@ -crit: criu - $(Q) $(MAKE) $(build)=crit all -.PHONY: crit - unittest: $(criu-deps) $(Q) $(MAKE) $(build)=criu unittest .PHONY: unittest # -# Libraries next once crit it ready +# Libraries next once criu is ready # (we might generate headers and such # when building criu itself). 
lib/Makefile: ; -lib/%: crit .FORCE +lib/%: criu .FORCE $(Q) $(MAKE) $(build)=lib $@ -lib: crit +lib: criu $(Q) $(MAKE) $(build)=lib all .PHONY: lib clean mrproper: $(Q) $(MAKE) $(build)=images $@ + $(Q) $(MAKE) $(build)=flog $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ - $(Q) $(MAKE) $(build)=lib $@ - $(Q) $(MAKE) $(build)=crit $@ .PHONY: clean mrproper clean-amdgpu_plugin: @@ -337,6 +348,10 @@ amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin +crit: lib + $(Q) $(MAKE) -C crit +.PHONY: crit + # # Generating tar requires tag matched CRIU_VERSION. # If not found then simply use GIT's describe with @@ -402,6 +417,7 @@ help: @echo ' Targets:' @echo ' all - Build all [*] targets' @echo ' * criu - Build criu' + @echo ' * crit - Build crit' @echo ' zdtm - Build zdtm test-suite' @echo ' docs - Build documentation' @echo ' install - Install CRIU (see INSTALL.md)' @@ -425,12 +441,15 @@ lint: flake8 --config=scripts/flake8.cfg test/zdtm.py flake8 --config=scripts/flake8.cfg test/inhfd/*.py flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py - flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py - flake8 --config=scripts/flake8.cfg lib/py/images/images.py + flake8 --config=scripts/flake8.cfg lib/pycriu/images/pb2dict.py + flake8 --config=scripts/flake8.cfg lib/pycriu/images/images.py flake8 --config=scripts/flake8.cfg scripts/criu-ns - flake8 --config=scripts/flake8.cfg crit/setup.py + flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py + flake8 --config=scripts/flake8.cfg crit/*.py + flake8 --config=scripts/flake8.cfg crit/crit/*.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py - flake8 --config=scripts/flake8.cfg coredump/ + flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump + flake8 --config=scripts/flake8.cfg 
scripts/github-indent-warnings.py shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install @@ -438,13 +457,14 @@ lint: shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh + shellcheck -x test/others/action-script/*.sh codespell -S tags - # Do not append \n to pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' - # Do not use %m with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|debug|info|msg)|fail)\>.*%m' - # Do not use errno with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>\(".*".*errno' + # Do not append \n to pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' + # Do not use %m with pr_* or fail + ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|pwarn|debug|info|msg)|fail)\>.*%m' + # Do not use errno with pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>\(".*".*errno' # End pr_(err|warn|msg|info|debug) with \n ! 
git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' # No EOL whitespace for C files diff --git a/Makefile.config b/Makefile.config index 270ec61c0f..8f2b5208e0 100644 --- a/Makefile.config +++ b/Makefile.config @@ -78,14 +78,15 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW MEMFD_CREATE \ + SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ - $(Q) echo '' >> $$@ +else + $(Q) echo '// CONFIG_HAS_$(1) is not set' >> $$@ endif endef diff --git a/Makefile.install b/Makefile.install index c798637beb..6f5b31924d 100644 --- a/Makefile.install +++ b/Makefile.install @@ -37,6 +37,10 @@ install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib +install-crit: lib + $(Q) $(MAKE) $(build)=crit install +.PHONY: install-crit + install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu @@ -50,12 +54,13 @@ install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-criu install-compel install-amdgpu_plugin ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/README.md b/README.md index ff4aa1a239..11d1c490b6 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Pages worth starting with are: - Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and 
[here](https://criu.org/FAQ) ### Checkpoint and restore of simple loop process -[

](https://asciinema.org/a/232445) +

## Advanced features diff --git a/compel/Makefile b/compel/Makefile index b79aee6871..78ec4826af 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -33,7 +33,7 @@ lib-y += arch/$(ARCH)/src/lib/thread_area.o endif # handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64,$(ARCH)),) +ifneq ($(filter arm aarch64 loongarch64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index f8ec55d6c0..9152024fd8 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -18,7 +18,7 @@ struct aux_context { struct _aarch64_ctx end; }; -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 8bcc3cc50a..217e346a31 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -39,7 +39,7 @@ recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, str sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) -bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +bind 200 282 (int sockfd, const struct sockaddr *addr, int addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) @@ -122,3 +122,4 @@ pidfd_open 434 434 (pid_t pid, unsigned 
int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/loongarch64/plugins/include/asm/prologue.h b/compel/arch/loongarch64/plugins/include/asm/prologue.h new file mode 100644 index 0000000000..c19ce54d7a --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME It is rather should be taken from sigframe header. 
+ */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/asm/syscall-types.h b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000000..b883bd8bed --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h @@ -0,0 +1,30 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#include +/* Types for sigaction, sigprocmask syscalls */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +/* refer to arch/loongarch/include/uapi/asm/signal.h */ +#define _KNSIG 64 +#define _NSIG_BPW BITS_PER_LONG +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#define SA_RESTORER 0x04000000 + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/features.h b/compel/arch/loongarch64/plugins/include/features.h new file mode 100644 index 0000000000..b4a3cded2b --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/compel/arch/loongarch64/plugins/std/parasite-head.S b/compel/arch/loongarch64/plugins/std/parasite-head.S new file mode 100644 index 0000000000..3a960490eb --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/parasite-head.S @@ -0,0 +1,9 @@ + +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + bl parasite_service; + break 0; +END(__export_parasite_head_start) + diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000000..0d08f34e1d --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,117 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o +sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-loongarch-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + 
$(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat 
$$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S new file mode 100644 index 0000000000..fff8944669 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S @@ -0,0 +1,44 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ +ENTRY(name); \ + addi.d $a7, $zero, opcode; \ + syscall 0; \ + jirl $r0, $r1, 0; \ +END(name) + +#ifndef AT_FDCWD +#define AT_FDCWD -100 +#endif + +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif + +ENTRY(sys_open) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_openat +END(sys_open) + +ENTRY(sys_mkdir) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_mkdirat +END(sys_mkdir) + +ENTRY(sys_rmdir) + addi.d $a2, $zero, AT_REMOVEDIR + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_unlinkat +END(sys_rmdir) + +ENTRY(__cr_restore_rt) + addi.d $a7, $zero, __NR_rt_sigreturn + syscall 0 +END(__cr_restore_rt) diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 0000000000..aa6ffb44d1 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,122 @@ +# +# System calls table, please make sure the table consist only the syscalls +# really used somewhere in project. +# from kernel/linux-3.10.84/arch/mips/include/uapi/asm/unistd.h Linux 64-bit syscalls are in the range from 5000 to 5999. +# +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int 
fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 
sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid 
(void) +__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) 
+__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_membarrier 283 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) +#__NR_rmdir ! sys_rmdir (const char *name) +#__NR_unlink ! sys_unlink (char *pathname) +#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) +#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) +#__NR_mkdir ! sys_mkdir (const char *name, int mode) +#__NR_open ! 
sys_open (const char *filename, unsigned long flags, unsigned long mode) diff --git a/compel/arch/loongarch64/scripts/compel-pack.lds.S b/compel/arch/loongarch64/scripts/compel-pack.lds.S new file mode 100644 index 0000000000..cfb7a2fb35 --- /dev/null +++ b/compel/arch/loongarch64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(loongarch) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} diff --git a/compel/arch/loongarch64/src/lib/cpu.c b/compel/arch/loongarch64/src/lib/cpu.c new file mode 100644 index 0000000000..172b90e275 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/cpu.c @@ -0,0 +1,41 @@ +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } + + return compel_test_cpu_cap(&rt_info, feature); +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf-host.c b/compel/arch/loongarch64/src/lib/handle-elf-host.c new file mode 100644 index 0000000000..a605a5a452 --- /dev/null +++ 
b/compel/arch/loongarch64/src/lib/handle-elf-host.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf.c b/compel/arch/loongarch64/src/lib/handle-elf.c new file mode 100644 index 0000000000..a605a5a452 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/include/handle-elf.h b/compel/arch/loongarch64/src/lib/include/handle-elf.h new file mode 100644 index 0000000000..b0a66ef879 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/handle-elf.h @@ -0,0 +1,8 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define arch_is_machine_supported(e_machine) (e_machine == EM_LOONGARCH) + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/syscall.h b/compel/arch/loongarch64/src/lib/include/syscall.h new 
file mode 100644 index 0000000000..ac3e2799ac --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +#ifndef SIGSTKFLT +#define SIGSTKFLT 16 +#endif + +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000000..21eb1309f2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000000..e568df789c --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; +#endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000000..7f476d541a --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000000..0b047a5b08 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,67 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * From the Linux kernel header arch/loongarch/include/uapi/asm/ptrace.h 
+ * + * A thread LoongArch CPU context + * + * struct user_fp_state { + * uint64_t fpr[32]; + * uint64_t fcc; + * uint32_t fcsr; + * }; + * + * struct user_pt_regs { + * unsigned long regs[32]; + * unsigned long csr_era; + * unsigned long csr_badv; + * unsigned long reserved[11]; + * }; + */ + +struct user_gp_regs { + uint64_t regs[32]; + uint64_t orig_a0; + uint64_t pc; + uint64_t csr_badv; + uint64_t reserved[10]; +} __attribute__((aligned(8))); + +struct user_fp_regs { + uint64_t regs[32]; + uint64_t fcc; + uint32_t fcsr; +}; + +typedef struct user_gp_regs user_regs_struct_t; +typedef struct user_fp_regs user_fpregs_struct_t; + +#define user_regs_native(regs) true + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(r) ((uint64_t)(r).regs[4]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SP(r) ((uint64_t)(r).regs[3]) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[11]) +#define SET_REG_IP(r, val) ((r).pc = (val)) + +#define GPR_NUM 32 +#define FPR_NUM 32 + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000000..fcb545a1d2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,86 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include +#include + +#include +#include + +#include + +#define rt_sigcontext sigcontext +/* sigcontext defined in usr/include/uapi/asm/sigcontext.h*/ +#include +typedef __u32 u32; + +typedef struct sigcontext_t { + __u64 pc; + __u64 regs[32]; + __u32 flags; + __u64 extcontext[0] __attribute__((__aligned__(16))); +} sigcontext_t; + +typedef struct context_info_t { 
+ __u32 magic; + __u32 size; + __u64 padding; +} context_info_t; + +#define FPU_CTX_MAGIC 0x46505501 +#define FPU_CTX_ALIGN 8 +typedef struct fpu_context_t { + __u64 regs[32]; + __u64 fcc; + __u64 fcsr; +} fpu_context_t; + +typedef struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + sigset_t uc_sigmask; + __u8 __unused[1024 / 8 - sizeof(sigset_t)]; + sigcontext_t uc_mcontext; +} ucontext; + +/* Copy from the kernel source arch/loongarch/kernel/signal.c */ +struct rt_sigframe { + rt_siginfo_t rs_info; + ucontext rs_uc; +}; + +#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe->rs_uc)) +#define RT_SIGFRAME_SIGMASK(rt_sigframe) ((k_rtsigset_t *)&RT_SIGFRAME_UC(rt_sigframe)->uc_sigmask) +#define RT_SIGFRAME_SIGCTX(rt_sigframe) (&(RT_SIGFRAME_UC(rt_sigframe)->uc_mcontext)) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(RT_SIGFRAME_SIGCTX(rt_sigframe)->pc)) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) + +#define RT_SIGFRAME_FPU(rt_sigframe) \ + ({ \ + context_info_t *ctx = (context_info_t *)RT_SIGFRAME_SIGCTX(rt_sigframe)->extcontext; \ + ctx->magic = FPU_CTX_MAGIC; \ + ctx->size = sizeof(context_info_t) + sizeof(fpu_context_t); \ + (fpu_context_t *)((char *)ctx + sizeof(context_info_t)); \ + }) + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "addi.d $sp, %0, 0 \n" \ + "addi.d $a7, $zero, "__stringify(__NR_rt_sigreturn)" \n" \ + "syscall 0" \ + : \ + :"r"(new_sp) \ + : "$a7", "memory") +/* clang-format on */ + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); + +#define rt_sigframe_erase_sigset(sigframe) memset(RT_SIGFRAME_SIGMASK(sigframe), 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(RT_SIGFRAME_SIGMASK(sigframe), from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/loongarch64/src/lib/infect.c 
b/compel/arch/loongarch64/src/lib/infect.c new file mode 100644 index 0000000000..8e3c19aff2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include + +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "common/page.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" +#include "common/bug.h" + +/* + * Injected syscall instruction + * loongarch64 is Little Endian + */ +const char code_syscall[] = { + 0x00, 0x00, 0x2b, 0x00, /* syscall */ + 0x00, 0x00, 0x2a, 0x00 /* break */ +}; + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigcontext_t *sc; + fpu_context_t *fpu; + + sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, regs->regs, sizeof(regs->regs)); + sc->pc = regs->pc; + + fpu = RT_SIGFRAME_FPU(sigframe); + memcpy(fpu->regs, fpregs->regs, sizeof(fpregs->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; + struct iovec iov; + int ret; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { + pr_perror("Failed to obtain CPU registers for %d", pid); + goto err; + } + + /* + * Refer to Linux kernel arch/loongarch/kernel/signal.c + */ + if (regs->regs[0]) { + switch (regs->regs[4]) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->regs[4] = regs->orig_a0; + regs->pc -= 4; + break; + case -ERESTART_RESTARTBLOCK: + regs->regs[4] = regs->orig_a0; + regs->regs[11] = __NR_restart_syscall; + regs->pc -= 4; + break; + } + regs->regs[0] = 0; /* Don't deal with this again. */ + } + + iov.iov_base = fpregs; + iov.iov_len = sizeof(user_fpregs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + goto err; + } + + ret = save(arg, regs, fpregs); +err: + return ret; /* propagate ptrace()/save() failure instead of always reporting success */ +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +/* + * Registers $4 ~ $11 represents arguments a0 ~ a7, especially a7 is + * used as syscall number. 
+ */ +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + int err; + user_regs_struct_t regs = ctl->orig.regs; + + regs.regs[11] = (unsigned long)nr; + regs.regs[4] = arg1; + regs.regs[5] = arg2; + regs.regs[6] = arg3; + regs.regs[7] = arg4; + regs.regs[8] = arg5; + regs.regs[9] = arg6; + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.regs[4]; + + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); + + if (err < 0 || IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->regs[4] = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); + return err ? 
err : ret; +} + +/* + * TODO: add feature + */ +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +/* + * Refer to Linux kernel arch/loongarch/include/asm/processor.h + */ +#define TASK_SIZE32 ((1UL) << 31) +#define TASK_SIZE64_MIN ((1UL) << 40) +#define TASK_SIZE64_MAX ((1UL) << 48) + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + for (task_size = TASK_SIZE64_MIN; task_size < TASK_SIZE64_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 505ec849d7..9f50d5e8ad 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -119,3 +119,4 @@ __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 5318 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index af40d71045..4c9b75cf1b 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -118,3 +118,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 365 sys_membarrier (int cmd, unsigned int flags, int cpu_id) 
diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index 8cc94ba740..0c4ccb6486 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -14,7 +14,7 @@ */ #include -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index db999ce37f..1603ac92e5 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -11,6 +11,7 @@ #include "log.h" #include "common/bug.h" #include "common/page.h" +#include "common/err.h" #include "infect.h" #include "infect-priv.h" @@ -303,33 +304,58 @@ static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) return -1; /* still failing the checkpoint */ } -static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) -{ - pr_info("Dumping GP/FPU registers for %d\n", pid); +/* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ - /* - * This is inspired by kernel function check_syscall_restart in - * arch/powerpc/kernel/signal.c - */ #ifndef TRAP #define TRAP(r) ((r).trap & ~0xF) #endif - if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { - /* Restart the system call */ - switch (regs->gpr[3]) { - case ERESTARTNOHAND: - case ERESTARTSYS: - case ERESTARTNOINTR: - regs->gpr[3] = regs->orig_gpr3; - regs->nip -= 4; - break; - case ERESTART_RESTARTBLOCK: - pr_warn("Will restore %d with interrupted system call\n", pid); - regs->gpr[3] = EINTR; - break; - } +static bool trap_is_scv(user_regs_struct_t *regs) +{ + return TRAP(*regs) == 0x3000; +} + +static bool trap_is_syscall(user_regs_struct_t *regs) +{ + return trap_is_scv(regs) || TRAP(*regs) == 0x0C00; 
+} + +static void handle_syscall(pid_t pid, user_regs_struct_t *regs) +{ + unsigned long ret = regs->gpr[3]; + + if (trap_is_scv(regs)) { + if (!IS_ERR_VALUE(ret)) + return; + ret = -ret; + } else if (!(regs->ccr & 0x10000000)) { + return; + } + + /* Restart or interrupt the system call */ + switch (ret) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + break; + case ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->gpr[3] = trap_is_scv(regs) ? -EINTR : EINTR; + break; } +} + +static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (trap_is_syscall(regs)) + handle_syscall(pid, regs); /* Resetting trap since we are now coming from user space. */ regs->trap = 0; diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index 6a349e1cb7..af7d550e2c 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -118,3 +118,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 356 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index a119a59b2e..ab36a5cd6f 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -106,3 +106,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char 
*pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 375 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 16dd86e791..57681b79a7 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -117,3 +117,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index bd3b0cbd5c..8c83dd9ae4 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -21,7 +21,28 @@ #define XSTATE_YMM 0x4 #define FXSAVE_SIZE 512 -#define XSAVE_SIZE 4096 +/* + * This used to be 4096 (one page). There is a comment below concerning + * this size: + * "One page should be enough for the whole xsave state ;-)" + * Which is kind of funny as it is no longer enough ;-) + * + * Older CPUs: + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) + * + * Newer CPUs (Sapphire Rapids): + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) + * + * So one page is no longer enough... 
But: + * + * Four pages should be enough for the whole xsave state ;-) + */ + +#define XSAVE_SIZE 4*4096 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE @@ -235,8 +256,11 @@ struct pkru_state { * * * One page should be enough for the whole xsave state ;-) + * + * Of course it was not ;-) Now using four pages... + * */ -#define EXTENDED_STATE_AREA_SIZE (4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) +#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) /* * cpu requires it to be 64 byte aligned diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 01959b95b2..88bdb4047e 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -220,6 +220,16 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr #define get_signed_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : (int32_t)((pregs)->compat.name)) +static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) +{ + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; +} + static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) { struct iovec iov; @@ -232,14 +242,15 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) return -1; } - return 0; -} - -static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) -{ - if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { - pr_perror("Can't obtain FPU registers for %d", pid); - return -1; + if ((xsave->xsave_hdr.xstate_bv & 3) != 3) { + // Due to init-optimisation [1] x87 FPU or SSE state may not be filled in. + // Since those are restored unconditionally, make sure the init values are + // filled by retrying with old PTRACE_GETFPREGS. 
+ // + // [1] Intel® 64 and IA-32 Architectures Software Developer's + // Manual Volume 1: Basic Architecture + // Section 13.6: Processor tracking of XSAVE-managed state + return get_task_fpregs(pid, xsave); } return 0; diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index abecc140f1..60b21d3132 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -27,7 +27,7 @@ static struct rt_sigframe *sigframe; */ static unsigned __page_size; -unsigned __attribute((weak)) page_size(void) +unsigned long __attribute((weak)) page_size(void) { return __page_size; } diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 5aab7aa3ee..b9a913fa1e 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -589,7 +589,7 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t } if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); goto err; } @@ -1398,7 +1398,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); return -1; } @@ -1577,7 +1577,7 @@ int compel_stop_pie(pid_t pid, void *addr, bool no_bp) int ret; if (no_bp) { - pr_debug("Force no-breakpoints restore\n"); + pr_debug("Force no-breakpoints restore of %d\n", pid); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); diff --git a/compel/src/main.c b/compel/src/main.c index ef05a46d01..bc16c0ab41 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -57,6 +57,9 @@ static const flags_t flags = { #elif defined CONFIG_MIPS .arch = "mips", .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_LOONGARCH64 + .arch = "loongarch64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif diff --git 
a/compel/test/fdspy/spy.c b/compel/test/fdspy/spy.c index 7f20ea2a7f..41de99e200 100644 --- a/compel/test/fdspy/spy.c +++ b/compel/test/fdspy/spy.c @@ -110,11 +110,11 @@ static int check_pipe_ends(int wfd, int rfd) printf("Check pipe ends are connected\n"); if (write(wfd, "1", 2) != 2) { fprintf(stderr, "write to pipe failed\n"); - return -1; + return 0; } if (read(rfd, aux, sizeof(aux)) != sizeof(aux)) { fprintf(stderr, "read from pipe failed\n"); - return -1; + return 0; } if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst index c2d1509fa1..ce45f1b7cf 100644 --- a/contrib/debian/dev-packages.lst +++ b/contrib/debian/dev-packages.lst @@ -17,4 +17,3 @@ libcap-dev libaio-dev python3-yaml libnl-route-3-dev -python-future diff --git a/coredump/coredump.py b/coredump/coredump old mode 100644 new mode 100755 similarity index 98% rename from coredump/coredump.py rename to coredump/coredump index 88a1b374c6..f70d37c13b --- a/coredump/coredump.py +++ b/coredump/coredump @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os import sys diff --git a/coredump/coredump-python2 b/coredump/coredump-python2 deleted file mode 100755 index 564c05ce9f..0000000000 --- a/coredump/coredump-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -import coredump - -if __name__ == '__main__': - coredump.main() diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 8ee4026768..20ec8e5dc8 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -35,12 +35,6 @@ from pycriu import images from . 
import elf - -try: - from itertools import ifilter as filter -except ImportError: - pass - # Some memory-related constants PAGESIZE = 4096 status = { @@ -59,6 +53,7 @@ "VMA_AREA_SOCKET": 1 << 11, "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, + "VMA_AREA_MEMFD": 1 << 14, "VMA_AREA_UNSUPP": 1 << 31 } @@ -318,10 +313,7 @@ def _gen_prpsinfo(self, pid): # prpsinfo.pr_psargs has a limit of 80 characters which means it will # fail here if the cmdline is longer than 80 prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] - if (sys.version_info > (3, 0)): - prpsinfo.pr_fname = core["tc"]["comm"].encode() - else: - prpsinfo.pr_fname = core["tc"]["comm"] + prpsinfo.pr_fname = core["tc"]["comm"].encode() nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -581,10 +573,7 @@ class elf_files(ctypes.Structure): setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) - if (sys.version_info > (3, 0)): - setattr(data, "name" + str(i), info.name.encode()) - else: - setattr(data, "name" + str(i), info.name) + setattr(data, "name" + str(i), info.name.encode()) nhdr = elf.Elf64_Nhdr() diff --git a/coredump/pycriu b/coredump/pycriu index d13a8790a9..d1b6ed5c45 120000 --- a/coredump/pycriu +++ b/coredump/pycriu @@ -1 +1 @@ -../lib/py/ \ No newline at end of file +../lib/pycriu \ No newline at end of file diff --git a/crit/.gitignore b/crit/.gitignore index 810661179d..10c8ab1869 100644 --- a/crit/.gitignore +++ b/crit/.gitignore @@ -1,2 +1,4 @@ crit.egg-info/ build/ +dist/ +version.py diff --git a/crit/Makefile b/crit/Makefile index 988b481b63..9a856db6d2 100644 --- a/crit/Makefile +++ b/crit/Makefile @@ -1,13 +1,40 @@ +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES := 0 -all-y += crit +VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) 
-crit/crit: crit/crit-$(PYTHON) - $(Q) cp $^ $@ -crit: crit/crit -.PHONY: crit +all-y += ${VERSION_FILE} +cleanup-y += ${VERSION_FILE} -clean-crit: - $(Q) $(RM) crit/crit -.PHONY: clean-crit -clean: clean-crit -mrproper: clean +${VERSION_FILE}: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ + +install: ${VERSION_FILE} +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" +else + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit +endif +else + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit +endif +.PHONY: install + +uninstall: +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" +else + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +endif +else + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +endif +.PHONY: uninstall diff --git a/crit/crit-python2 b/crit/crit-python2 deleted file mode 100755 index b0b7d3c3a0..0000000000 --- a/crit/crit-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -from pycriu import cli - -if __name__ == '__main__': - cli.main() diff --git a/crit/crit-python3 b/crit/crit-python3 deleted file mode 100755 index 80467cba72..0000000000 --- a/crit/crit-python3 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -from pycriu import cli - -if __name__ == '__main__': - cli.main() diff --git a/crit/crit/__init__.py b/crit/crit/__init__.py new file mode 100644 index 0000000000..58f3ace6c0 --- 
/dev/null +++ b/crit/crit/__init__.py @@ -0,0 +1 @@ +from .version import __version__ diff --git a/lib/py/cli.py b/crit/crit/__main__.py similarity index 94% rename from lib/py/cli.py rename to crit/crit/__main__.py index 5419384c3d..e15327f503 100755 --- a/lib/py/cli.py +++ b/crit/crit/__main__.py @@ -1,18 +1,17 @@ -from __future__ import print_function +#!/usr/bin/env python3 import argparse import sys import json import os import pycriu +from . import __version__ def inf(opts): if opts['in']: return open(opts['in'], 'rb') else: - if (sys.version_info < (3, 0)): - return sys.stdin if sys.stdin.isatty(): # If we are reading from a terminal (not a pipe) we want text input and not binary return sys.stdin @@ -28,8 +27,6 @@ def outf(opts, decode): mode = 'w+' return open(opts['out'], mode) else: - if (sys.version_info < (3, 0)): - return sys.stdout if decode: return sys.stdout return sys.stdout.buffer @@ -45,9 +42,9 @@ def decode(opts): try: img = pycriu.images.load(inf(opts), opts['pretty'], opts['nopl']) except pycriu.images.MagicException as exc: - print("Unknown magic %#x.\n"\ - "Maybe you are feeding me an image with "\ - "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) + print("Unknown magic %#x.\n" + "Maybe you are feeding me an image with " + "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) sys.exit(1) if opts['pretty']: @@ -63,9 +60,9 @@ def encode(opts): try: img = json.load(inf(opts)) except UnicodeDecodeError: - print("Cannot read JSON.\n"\ - "Maybe you are feeding me an image with protobuf data? "\ - "Encode expects JSON input.", file=sys.stderr) + print("Cannot read JSON.\n" + "Maybe you are feeding me an image with protobuf data? 
" + "Encode expects JSON input.", file=sys.stderr) sys.exit(1) pycriu.images.dump(img, outf(opts, False)) @@ -135,7 +132,7 @@ def ftype_find_in_files(opts, ft, fid): if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] - except: + except Exception: files_img = [] if len(files_img) == 0: @@ -368,6 +365,7 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--version', action='version', version=__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') @@ -377,8 +375,7 @@ def main(): 'decode', help='convert criu image from binary type to json') decode_parser.add_argument( '--pretty', - help= - 'Multiline with indents and some numerical fields in field-specific format', + help='Multiline with indents and some numerical fields in field-specific format', action='store_true') decode_parser.add_argument( '-i', diff --git a/crit/pycriu b/crit/pycriu deleted file mode 120000 index d13a8790a9..0000000000 --- a/crit/pycriu +++ /dev/null @@ -1 +0,0 @@ -../lib/py/ \ No newline at end of file diff --git a/crit/pyproject.toml b/crit/pyproject.toml index b1e1a4650a..9089f0a394 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -1,2 +1,22 @@ [build-system] requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "crit" +description = "CRiu Image Tool" +authors = [ + {name = "CRIU team", email = "criu@openvz.org"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[project.scripts] +crit = "crit.__main__:main" + +[tool.setuptools] +packages = ["crit"] + +[tool.setuptools.dynamic] +version = {attr = "crit.__version__"} diff --git a/crit/setup.cfg b/crit/setup.cfg new file mode 100644 index 0000000000..fbc9a51439 --- /dev/null +++ b/crit/setup.cfg @@ -0,0 +1,20 @@ +# Configuring setuptools using pyproject.toml files was 
introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = crit +description = CRiu Image Tool +author = CRIU team +author_email = criu@openvz.org +license = GPLv2 +version = attr: crit.__version__ + +[options] +packages = crit +python_requires = >=3.6 + +[options.entry_points] +console_scripts = + crit = crit.__main__:main diff --git a/crit/setup.py b/crit/setup.py index 1aaa73a130..618ac1de48 100644 --- a/crit/setup.py +++ b/crit/setup.py @@ -1,29 +1,6 @@ -import os -from setuptools import setup, find_packages +#!/usr/bin/env python3 +import setuptools -def get_version(): - version = '0.0.1' - env = os.environ - if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: - version = '{}.{}'.format( - env['CRIU_VERSION_MAJOR'], - env['CRIU_VERSION_MINOR'] - ) - if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: - version += '.' 
+ env['CRIU_VERSION_SUBLEVEL'] - return version - - -setup( - name='crit', - version=get_version(), - description='CRiu Image Tool', - author='CRIU team', - author_email='criu@openvz.org', - license='GPLv2', - url='https://github.com/checkpoint-restore/criu', - packages=find_packages('.'), - scripts=['crit'], - install_requires=[], -) +if __name__ == '__main__': + setuptools.setup() diff --git a/criu/Makefile b/criu/Makefile index 55bdb1b7a3..bafdd980bb 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -85,7 +85,7 @@ $(obj)/%: pie $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) $(GMONLDOPT) -rdynamic -o $@ UNIT-BUILTINS += $(obj)/util.o UNIT-BUILTINS += $(obj)/config.o @@ -102,7 +102,7 @@ $(obj)/unittest/built-in.o: .FORCE $(obj)/unittest/unittest: $(UNIT-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) -rdynamic -o $@ unittest: $(obj)/unittest/unittest $(Q) $(obj)/unittest/$@ @@ -145,10 +145,8 @@ install: $(obj)/criu $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh $(DESTDIR)$(LIBEXECDIR)/criu/scripts -ifeq ($(PYTHON),python3) $(E) " INSTALL " scripts/criu-ns $(Q) install -m 755 scripts/criu-ns $(DESTDIR)$(SBINDIR) -endif .PHONY: install uninstall: diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 13c346f449..7f6113c8f1 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,7 +6,6 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel -REQ-RPM-PKG-NAMES += $(PYTHON)-future REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -15,20 +14,16 @@ REQ-DEB-PKG-NAMES += 
libprotobuf-c-dev REQ-DEB-PKG-NAMES += protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf -REQ-DEB-PKG-NAMES += $(PYTHON)-future REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev -ifeq ($(PYTHON),python3) REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -else -REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml -endif + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 1ce6d9c108..6f79001864 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -18,6 +18,7 @@ #include "common/scm.h" static const char *action_names[ACT_MAX] = { + [ACT_PRE_STREAM] = "pre-stream", [ACT_PRE_DUMP] = "pre-dump", [ACT_POST_DUMP] = "post-dump", [ACT_PRE_RESTORE] = "pre-restore", @@ -30,6 +31,7 @@ static const char *action_names[ACT_MAX] = { [ACT_POST_RESUME] = "post-resume", [ACT_ORPHAN_PTS_MASTER] = "orphan-pts-master", [ACT_STATUS_READY] = "status-ready", + [ACT_QUERY_EXT_FILES] = "query-ext-files", }; struct script { @@ -51,6 +53,9 @@ static int run_shell_scripts(const char *action) #define ENV_IMGDIR 0x1 #define ENV_ROOTPID 0x2 + if (list_empty(&scripts)) + return 0; + if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); return -1; @@ -111,6 +116,20 @@ int rpc_send_fd(enum script_actions act, int fd) return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } +int rpc_query_external_files(void) +{ + int rpc_sk; + + if (scripts_mode != SCRIPTS_RPC) + return 0; + + rpc_sk = get_service_fd(RPC_SK_OFF); + if (rpc_sk < 0) + return -1; + + return exec_rpc_query_external_files((char *)action_names[ACT_QUERY_EXT_FILES], rpc_sk); +} + int run_scripts(enum script_actions act) { int ret = 0; @@ -118,23 +137,24 @@ int run_scripts(enum script_actions act) pr_debug("Running %s scripts\n", 
action); - if (scripts_mode == SCRIPTS_NONE) + switch (scripts_mode) { + case SCRIPTS_NONE: return 0; - - if (scripts_mode == SCRIPTS_RPC) { + case SCRIPTS_RPC: ret = rpc_send_fd(act, -1); - goto out; - } - - if (scripts_mode == SCRIPTS_SHELL) { + if (ret) + break; + /* Enable scripts from config file in RPC mode (fallthrough) */ + case SCRIPTS_SHELL: ret = run_shell_scripts(action); - goto out; + break; + default: + BUG(); } - BUG(); -out: if (ret) pr_err("One of more action scripts failed\n"); + return ret; } @@ -142,8 +162,9 @@ int add_script(char *path) { struct script *script; - BUG_ON(scripts_mode == SCRIPTS_RPC); - scripts_mode = SCRIPTS_SHELL; + /* Set shell mode when a script is added but don't overwrite RPC mode */ + if (scripts_mode == SCRIPTS_NONE) + scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) @@ -169,7 +190,6 @@ int add_rpc_notify(int sk) return -1; } - BUG_ON(scripts_mode == SCRIPTS_SHELL); scripts_mode = SCRIPTS_RPC; if (install_service_fd(RPC_SK_OFF, fd) < 0) diff --git a/criu/apparmor.c b/criu/apparmor.c index 9de54ce40b..e46e239f59 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -207,8 +207,6 @@ static int by_time(const struct dirent **de1, const struct dirent **de2) } else { if (sb1.st_mtim.tv_sec < sb2.st_mtim.tv_sec) return -1; - if (sb1.st_mtim.tv_sec == sb2.st_mtim.tv_sec) - return 0; return 1; } } @@ -551,8 +549,8 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit goto fail; } - ret = snprintf(path + offset + my_offset, sizeof(path) - offset - my_offset, "/.replace"); - if (ret < 0 || ret >= sizeof(path) - offset - my_offset) { + ret = snprintf(path + offset + my_offset, PATH_MAX - offset - my_offset, "/.replace"); + if (ret < 0 || ret >= PATH_MAX - offset - my_offset) { pr_err("snprintf failed\n"); goto fail; } diff --git a/criu/arch/loongarch64/Makefile b/criu/arch/loongarch64/Makefile new file mode 100644 index 0000000000..4bd99eb7eb --- /dev/null +++ 
b/criu/arch/loongarch64/Makefile @@ -0,0 +1,14 @@ +builtin-name := crtools.built-in.o + +ccflags-y += -iquote $(obj)/include +ccflags-y += -iquote criu/include -iquote include +ccflags-y += $(COMPEL_UAPI_INCLUDES) + +asflags-y += -Wstrict-prototypes +asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer +asflags-y += -iquote $(obj)/include +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o diff --git a/criu/arch/loongarch64/cpu.c b/criu/arch/loongarch64/cpu.c new file mode 100644 index 0000000000..5559c4288f --- /dev/null +++ b/criu/arch/loongarch64/cpu.c @@ -0,0 +1,31 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + return 0; +} diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c new file mode 100644 index 0000000000..eeb0731ca6 --- /dev/null +++ b/criu/arch/loongarch64/crtools.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "log.h" +#include "asm/restorer.h" +#include "asm/parasite-syscall.h" +#include +#include "asm/dump.h" +#include "cr_options.h" +#include "common/compiler.h" +#include "restorer.h" +#include "parasite-syscall.h" +#include "util.h" +#include "cpu.h" +#include +#include "kerndat.h" + +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" + +#define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + int i; + CoreEntry *core = x; + UserLoongarch64GpregsEntry *gprs = core->ti_loongarch64->gpregs; + UserLoongarch64FpregsEntry *fprs = core->ti_loongarch64->fpregs; + for (i 
= 0; i < GPR_NUM; i++) + assign_reg(gprs, regs, regs[i]); + assign_reg(gprs, regs, pc); + + for (i = 0; i < FPR_NUM; i++) + assign_reg(fpregs, fpregs, regs[i]); + assign_reg(fprs, fpregs, fcc); + assign_reg(fprs, fpregs, fcsr); + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoLoongarch64 *ti_loongarch64; + UserLoongarch64GpregsEntry *gpregs; + UserLoongarch64FpregsEntry *fpregs; + + ti_loongarch64 = xmalloc(sizeof(*ti_loongarch64)); + thread_info_loongarch64__init(ti_loongarch64); + core->ti_loongarch64 = ti_loongarch64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_loongarch64_gpregs_entry__init(gpregs); + gpregs->n_regs = GPR_NUM; + gpregs->regs = xmalloc(GPR_NUM * sizeof(uint64_t)); + if (!gpregs->regs) + goto err; + ti_loongarch64->gpregs = gpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + goto err; + user_loongarch64_fpregs_entry__init(fpregs); + fpregs->n_regs = FPR_NUM; + fpregs->regs = xmalloc(FPR_NUM * sizeof(uint64_t)); + if (!fpregs->regs) + goto err; + ti_loongarch64->fpregs = fpregs; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (CORE_THREAD_ARCH_INFO(core)) { + if (CORE_THREAD_ARCH_INFO(core)->fpregs) { + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs); + } + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); + xfree(CORE_THREAD_ARCH_INFO(core)); + CORE_THREAD_ARCH_INFO(core) = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + fpu_context_t *fpu = RT_SIGFRAME_FPU(sigframe); + UserLoongarch64FpregsEntry *fpregs = core->ti_loongarch64->fpregs; + + memcpy(fpu->regs, fpregs->regs, sizeof(fpu->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int restore_gpregs(struct rt_sigframe *sigframe, UserRegsEntry *r) +{ + sigcontext_t *sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, r->regs, 
sizeof(sc->regs)); + sc->pc = r->pc; + return 0; +} diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h new file mode 100644 index 0000000000..04347155c3 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_loongarch64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/int.h b/criu/arch/loongarch64/include/asm/int.h new file mode 100644 index 0000000000..642804e9b4 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/kerndat.h b/criu/arch/loongarch64/include/asm/kerndat.h new file mode 100644 index 0000000000..bb70cf6cf5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/parasite-syscall.h b/criu/arch/loongarch64/include/asm/parasite-syscall.h new file mode 100644 index 0000000000..6008c37923 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/loongarch64/include/asm/parasite.h b/criu/arch/loongarch64/include/asm/parasite.h new file mode 100644 index 0000000000..b64cb3185c --- /dev/null +++ 
b/criu/arch/loongarch64/include/asm/parasite.h @@ -0,0 +1,11 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm volatile("or %0, $zero, $tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/loongarch64/include/asm/restore.h b/criu/arch/loongarch64/include/asm/restore.h new file mode 100644 index 0000000000..d956231c81 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restore.h @@ -0,0 +1,33 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ +({ \ + uint64_t save_sp; \ + asm volatile("or %0, $zero, $sp" : "=r"(save_sp) : :"memory"); \ + asm volatile( \ + "or $a0, $zero, %2 \n" \ + "or $sp, $zero, %0 \n" \ + "jirl $ra, %1, 0 \n" \ + : \ + : "r"(new_sp & ~15), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "$a0", "memory"); \ + asm volatile("or $sp, $zero, %0" : : "r"(save_sp) : "memory"); \ +}) + +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_loongarch64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/loongarch64/include/asm/restorer.h b/criu/arch/loongarch64/include/asm/restorer.h new file mode 100644 index 0000000000..7a0d35c5b5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restorer.h @@ -0,0 +1,97 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include "asm/types.h" +#include +#include "images/core.pb-c.h" +#include +#include + +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld.d $a1, %2 \n" \ + "addi.d $a1, $a1, -16 \n" \ + "st.d %5, $a1, 0 \n" \ + "st.d %6, $a1, 8 \n" \ + "or $a0, $zero, %1 \n" \ + "or $a2, 
$zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone_end \n" \ + \ + "thread_run: \n" \ + "ld.d $a1, $sp, 0 \n" \ + "ld.d $a0, $sp, 8 \n" \ + "jirl $ra, $a1, 0 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "ZB"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(&clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + "or $a0, $zero, %1 \n" \ + "or $a1, $zero, %2 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone3)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, clone3_thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + "or $a0, $zero, $a3 \n" \ + "jirl $ra, $a2, 0 \n" \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") +/* clang-format on */ + +static inline void restore_tls(tls_t *ptls) +{ + asm volatile("or $tp, $zero, %0" : : "r"(*ptls)); +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +int restore_gpregs(struct rt_sigframe *f, UserLoongarch64GpregsEntry *r); +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r); + +#define arch_map_vdso(map, compat) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/thread_pointer.h b/criu/arch/loongarch64/include/asm/thread_pointer.h new file mode 100644 index 
0000000000..f7e07066a5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/loongarch64/include/asm/types.h b/criu/arch/loongarch64/include/asm/types.h new file mode 100644 index 0000000000..72bca2022b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/types.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" +#include "images/core.pb-c.h" + +#include + +#define core_is_compat(core) false + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__LOONGARCH64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_loongarch64 + +#define TI_SP(core) ((core)->ti_loongarch64->gpregs->regs[4]) + +#define TI_IP(core) ((core)->ti_loongarch64->gpregs->pc) + +typedef UserLoongarch64GpregsEntry UserRegsEntry; + +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} 
+ +#define AT_VECTOR_SIZE 44 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/loongarch64/include/asm/vdso.h b/criu/arch/loongarch64/include/asm/vdso.h new file mode 100644 index 0000000000..64631dee09 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/vdso.h @@ -0,0 +1,27 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 5 +#define VDSO_SYMBOL_GTOD 3 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__vdso_getcpu"; \ + const char *aarch_vdso_symbol2 = "__vdso_clock_getres"; \ + const char *aarch_vdso_symbol3 = "__vdso_clock_gettime"; \ + const char *aarch_vdso_symbol4 = "__vdso_gettimeofday"; \ + const char *aarch_vdso_symbol5 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5 +#endif diff --git a/criu/arch/loongarch64/restorer.c b/criu/arch/loongarch64/restorer.c new file mode 100644 index 0000000000..730318ac14 --- /dev/null +++ b/criu/arch/loongarch64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" +#include + +#include +#include "log.h" +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r) +{ + return 0; +} diff --git a/criu/arch/loongarch64/sigframe.c b/criu/arch/loongarch64/sigframe.c new file mode 100644 index 0000000000..18983ff138 --- /dev/null +++ b/criu/arch/loongarch64/sigframe.c @@ -0,0 +1,12 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" +#include +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} 
diff --git a/criu/arch/loongarch64/vdso-pie.c b/criu/arch/loongarch64/vdso-pie.c new file mode 100644 index 0000000000..7a75d2741d --- /dev/null +++ b/criu/arch/loongarch64/vdso-pie.c @@ -0,0 +1,48 @@ +#include +#include "asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " +static void insert_trampoline(uintptr_t from, uintptr_t to) +{ + struct { + uint32_t pcaddi; + uint32_t ldptr; + uint32_t jirl; + uint32_t guards; + uint64_t imm64; + } __packed jmp = { + .pcaddi = 0x18000095, /* pcaddi $x, 4 */ + .ldptr = 0x260002b5, /* ldptr.d $x, $x, 0 */ + .jirl = 0x4c0002a0, /* jirl $zero, $x, 0 */ + .guards = 0x002a0000, /* break 0 */ + .imm64 = to, + }; + memcpy((void *)from, &jmp, sizeof(jmp)); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, + struct vdso_symtable *sfrom, bool compat_vdso) +{ + unsigned int i; + unsigned long from, to; + for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { + if (vdso_symbol_empty(&sfrom->symbols[i])) + continue; + pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, + sto->symbols[i].offset, i); + + from = base_from + sfrom->symbols[i].offset; + to = base_to + sto->symbols[i].offset; + insert_trampoline(from, to); + } + return 0; +} diff --git a/criu/arch/x86/sigframe.c b/criu/arch/x86/sigframe.c index 4fa7eb3dc9..46612e70d3 100644 --- a/criu/arch/x86/sigframe.c +++ b/criu/arch/x86/sigframe.c @@ -23,7 +23,7 @@ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *r } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; - } else if (!sigframe->is_native) { + } else { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; if ((addr % 64ul)) { diff --git 
a/criu/cgroup.c b/criu/cgroup.c index 8243ac6d3c..67282f269e 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -639,8 +639,8 @@ static int open_cgroupfs(struct cg_ctl *cc) return -1; } - if (mount("none", prefix, fstype, 0, mopts) < 0) { - pr_perror("Unable to mount %s", mopts); + if (mount("none", prefix, fstype, 0, mopts[0] ? mopts : NULL) < 0) { + pr_perror("Unable to mount %s %s", fstype, mopts); rmdir(prefix); return -1; } @@ -714,6 +714,8 @@ static int collect_cgroups(struct list_head *ctls) } } else { fd = open_cgroupfs(cc); + if (fd < 0) + return -1; } path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); @@ -1202,17 +1204,12 @@ static int prepare_cgns(CgSetEntry *se) return 0; } -static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) +static int move_in_cgroup(CgSetEntry *se) { int i; pr_info("Move into %d\n", se->id); - if (setup_cgns && prepare_cgns(se) < 0) { - pr_err("failed preparing cgns\n"); - return -1; - } - for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int fd = -1, err, j, aux_off; @@ -1252,7 +1249,44 @@ static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) return 0; } -int prepare_task_cgroup(struct pstree_item *me) +int prepare_cgroup_namespace(struct pstree_item *root_task) +{ + CgSetEntry *se; + + if (opts.manage_cgroups == CG_MODE_IGNORE) + return 0; + + if (root_task->parent) { + pr_err("Expecting root_task to restore cgroup namespace\n"); + return -1; + } + + /* + * If on dump all dumped tasks are in same cgset with criu we don't + * dump cgsets and thus cgroup namespaces and rely that on restore + * criu caller would prepare proper cgset/cgns for us. Also in case + * of --unprivileged we don't even have the root cgset here. 
+ */ + if (!rsti(root_task)->cg_set || rsti(root_task)->cg_set == root_cg_set) { + pr_info("Cgroup namespace inherited from parent\n"); + return 0; + } + + se = find_rst_set_by_id(rsti(root_task)->cg_set); + if (!se) { + pr_err("No set %d found\n", rsti(root_task)->cg_set); + return -1; + } + + if (prepare_cgns(se) < 0) { + pr_err("failed preparing cgns\n"); + return -1; + } + + return 0; +} + +int restore_task_cgroup(struct pstree_item *me) { struct pstree_item *parent = me->parent; CgSetEntry *se; @@ -1284,13 +1318,7 @@ int prepare_task_cgroup(struct pstree_item *me) return -1; } - /* Since don't support nesting of cgroup namespaces, let's only set up - * the cgns (if it exists) in the init task. In the future, we should - * just check that the cgns prefix string matches for all the entries - * in the cgset, and only unshare if that's true. - */ - - return move_in_cgroup(se, !me->parent); + return move_in_cgroup(se); } void fini_cgroup(void) @@ -1309,34 +1337,6 @@ void fini_cgroup(void) cg_yard = NULL; } -static int restore_perms(int fd, const char *path, CgroupPerms *perms) -{ - struct stat sb; - - if (perms) { - if (fstat(fd, &sb) < 0) { - pr_perror("stat of property %s failed", path); - return -1; - } - - /* only chmod/chown if the perms are actually different: we aren't - * allowed to chmod some cgroup props (e.g. the read only ones), so we - * don't want to try if the perms already match. 
- */ - if (sb.st_mode != (mode_t)perms->mode && fchmod(fd, perms->mode) < 0) { - pr_perror("chmod of %s failed", path); - return -1; - } - - if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && fchown(fd, perms->uid, perms->gid)) { - pr_perror("chown of %s failed", path); - return -1; - } - } - - return 0; -} - static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) { char *current, *next; @@ -1434,7 +1434,7 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat return -1; } - if (restore_perms(fd, path, perms) < 0) + if (perms && cr_fchperm(fd, perms->uid, perms->gid, perms->mode) < 0) goto out; /* skip these two since restoring their values doesn't make sense */ @@ -1758,7 +1758,7 @@ static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e) static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) { - int fd, ret; + int fd, ret = 0; fd = openat(cg, path, O_DIRECTORY); if (fd < 0) { @@ -1766,7 +1766,8 @@ static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) return -1; } - ret = restore_perms(fd, path, perms); + if (perms) + ret = cr_fchperm(fd, perms->uid, perms->gid, perms->mode); close(fd); return ret; } diff --git a/criu/config.c b/criu/config.c index 9f02ae9928..961bc4367b 100644 --- a/criu/config.c +++ b/criu/config.c @@ -650,6 +650,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "ms", no_argument, 0, 1054 }, BOOL_OPT("track-mem", &opts.track_mem), BOOL_OPT("auto-dedup", &opts.auto_dedup), + BOOL_OPT("skip-zero-pages", &opts.skip_zero_pages), { "libdir", required_argument, 0, 'L' }, { "cpu-cap", optional_argument, 0, 1057 }, BOOL_OPT("force-irmap", &opts.force_irmap), @@ -1036,6 +1037,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, opts.network_lock_method = NETWORK_LOCK_IPTABLES; } else if (!strcmp("nftables", optarg)) { opts.network_lock_method = NETWORK_LOCK_NFTABLES; + } else 
if (!strcmp("skip", optarg) || !strcmp("none", optarg)) { + opts.network_lock_method = NETWORK_LOCK_SKIP; } else { pr_err("Invalid value for --network-lock: %s\n", optarg); return 1; diff --git a/criu/cr-check.c b/criu/cr-check.c index a4166f76ba..fea1ce674a 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1086,6 +1086,8 @@ static int kerndat_tcp_repair_window(void) int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0 && errno == EAFNOSUPPORT) + sk = socket(AF_INET6, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; @@ -1380,6 +1382,14 @@ static int check_ipv6_freebind(void) return 0; } +static int check_pagemap_scan(void) +{ + if (!kdat.has_pagemap_scan) + return -1; + + return 0; +} + static int (*chk_feature)(void); /* @@ -1500,6 +1510,7 @@ int cr_check(void) ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); ret |= check_ipv6_freebind(); + ret |= check_pagemap_scan(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1621,6 +1632,7 @@ static struct feature_list feature_list[] = { { "openat2", check_openat2 }, { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, + { "pagemap_scan", check_pagemap_scan }, { NULL, NULL }, }; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 90d763f497..ee5974acc9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -770,6 +770,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->tc->child_subreaper = misc->child_subreaper; core->tc->has_child_subreaper = true; + if (misc->membarrier_registration_mask) { + core->tc->membarrier_registration_mask = misc->membarrier_registration_mask; + core->tc->has_membarrier_registration_mask = true; + } + ret = get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; @@ -2180,6 +2185,9 @@ int cr_dump_tasks(pid_t pid) if (network_lock()) goto err; + if (rpc_query_external_files()) + goto err; + if 
(collect_file_locks()) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index f02e95f6d2..2700497216 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -863,6 +863,9 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a if (tc->has_child_subreaper) args->child_subreaper = tc->child_subreaper; + if (tc->has_membarrier_registration_mask) + args->membarrier_registration_mask = tc->membarrier_registration_mask; + /* loginuid value is critical to restore */ if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { ret = prepare_loginuid(tc->loginuid); @@ -1827,6 +1830,13 @@ static int restore_task_with_children(void *_arg) /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; + + /* + * Since we don't support nesting of cgroup namespaces, let's + * only set up the cgns (if it exists) in the init task. + */ + if (prepare_cgroup_namespace(current) < 0) + goto err; } if (needs_prep_creds(current) && (prepare_userns_creds())) @@ -1838,7 +1848,7 @@ static int restore_task_with_children(void *_arg) * we will only move the root one there, others will * just have it inherited. 
*/ - if (prepare_task_cgroup(current) < 0) + if (restore_task_cgroup(current) < 0) goto err; /* Restore root task */ @@ -2933,12 +2943,6 @@ static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry return ret; } -static inline int verify_cap_size(CredsEntry *ce) -{ - return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && (ce->n_cap_prm == CR_CAP_SIZE) && - (ce->n_cap_bnd == CR_CAP_SIZE)); -} - static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; @@ -2964,7 +2968,7 @@ static int prepare_mm(pid_t pid, struct task_restore_args *args) args->fd_exe_link = exe_fd; - args->has_thp_enabled = rsti(current)->has_thp_enabled; + args->thp_disabled = mm->has_thp_disabled && mm->thp_disabled; ret = 0; out: @@ -3353,17 +3357,31 @@ static bool groups_match(gid_t *groups, int n_groups) return ret; } +static void copy_caps(u32 *out_caps, u32 *in_caps, int n_words) +{ + int i, cap_end; + + for (i = kdat.last_cap + 1; i < 32 * n_words; ++i) { + if (~in_caps[i / 32] & (1 << (i % 32))) + continue; + + pr_warn("Dropping unsupported capability %d > %d)\n", i, kdat.last_cap); + /* extra caps will be cleared below */ + } + + n_words = min(n_words, (kdat.last_cap + 31) / 32); + cap_end = (kdat.last_cap & 31) + 1; + memcpy(out_caps, in_caps, sizeof(*out_caps) * n_words); + if ((cap_end & 31) && n_words) + out_caps[n_words - 1] &= (1 << cap_end) - 1; + memset(out_caps + n_words, 0, sizeof(*out_caps) * (CR_CAP_SIZE - n_words)); +} + static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) { unsigned long this_pos; struct thread_creds_args *args; - if (!verify_cap_size(ce)) { - pr_err("Caps size mismatch %d %d %d %d\n", (int)ce->n_cap_inh, (int)ce->n_cap_eff, (int)ce->n_cap_prm, - (int)ce->n_cap_bnd); - return ERR_PTR(-EINVAL); - } - this_pos = rst_mem_align_cpos(RM_PRIVATE); args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); @@ -3451,10 +3469,10 @@ static struct 
thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args->creds.groups = NULL; args->creds.lsm_profile = NULL; - memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh)); - memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); - memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); - memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); + copy_caps(args->cap_inh, ce->cap_inh, ce->n_cap_inh); + copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); + copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); + copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; diff --git a/criu/cr-service.c b/criu/cr-service.c index 314c309be9..19a1448efa 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -240,6 +240,49 @@ int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) return 0; } +int exec_rpc_query_external_files(char *name, int sk) +{ + int i, ret; + CriuNotify cn = CRIU_NOTIFY__INIT; + CriuResp msg = CRIU_RESP__INIT; + CriuReq *req; + + cn.script = name; + + msg.type = CRIU_REQ_TYPE__NOTIFY; + msg.success = true; + msg.notify = &cn; + + ret = send_criu_msg_with_fd(sk, &msg, -1); + if (ret < 0) + return ret; + + ret = recv_criu_msg(sk, &req); + if (ret < 0) + return ret; + + if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { + pr_err("RPC client reported script error\n"); + return -1; + } + + ret = 0; + if (req->opts) + for (i = 0; i < req->opts->n_external; i++) { + char *key = req->opts->external[i]; + pr_info("Adding external object: %s\n", key); + if (add_external(key)) { + pr_err("Failed to add external object: %s\n", key); + ret = -1; + } + } + else + pr_info("RPC NOTIFY %s: no `opts` returned.\n", name); + + criu_req__free_unpacked(req, NULL); + return ret; +} + static char images_dir[PATH_MAX]; static int setup_opts_from_req(int sk, CriuOpts *req) @@ -339,8 +382,14 @@ static int setup_opts_from_req(int sk, 
CriuOpts *req) */ if (imgs_changed_by_rpc_conf) strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else + else if (req->images_dir_fd != -1) sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + else if (req->images_dir) + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + goto err; + } if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); @@ -394,6 +443,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; } else if (!opts.output) { SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); } @@ -428,6 +480,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; + if (req->has_leave_stopped && req->leave_stopped) + opts.final_state = TASK_STOPPED; + if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; @@ -486,6 +541,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_auto_dedup) opts.auto_dedup = req->auto_dedup; + if (req->has_skip_zero_pages) + opts.skip_zero_pages = req->skip_zero_pages; + if (req->has_force_irmap) opts.force_irmap = req->force_irmap; @@ -520,6 +578,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) case CRIU_NETWORK_LOCK_METHOD__NFTABLES: opts.network_lock_method = NETWORK_LOCK_NFTABLES; break; + case CRIU_NETWORK_LOCK_METHOD__SKIP: + opts.network_lock_method = NETWORK_LOCK_SKIP; + break; default: goto err; } @@ -720,6 +781,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->orphan_pts_master) opts.orphan_pts_master = true; + if (req->has_display_stats) + opts.display_stats = req->display_stats; + /* Evaluate additional configuration file a second time to overwrite * all RPC settings. 
*/ if (req->config_file) { diff --git a/criu/crtools.c b/criu/crtools.c index 94657f4186..5de3ab724e 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -541,6 +541,7 @@ int main(int argc, char *argv[], char *envp[]) " pages images of previous dump\n" " when used on restore, as soon as page is restored, it\n" " will be punched from the image\n" + " --skip-zero-pages don't dump pages containing only zero bytes.\n" " --pre-dump-mode splice - parasite based pre-dumping (default)\n" " read - process_vm_readv syscall based pre-dumping\n" "\n" diff --git a/criu/files-reg.c b/criu/files-reg.c index ed8b9c8899..fc61493501 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -407,46 +407,24 @@ static int mklnk_ghost(char *path, GhostFileEntry *gfe) static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; - int ret = -1; - if (S_ISLNK(gfe->mode)) { - if (lchown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (cr_fchpermat(AT_FDCWD, path, gfe->uid, gfe->gid, gfe->mode, AT_SYMLINK_NOFOLLOW) < 0) + return -1; - /* - * We have no lchmod() function, and fchmod() will fail on - * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() - * function and flag AT_SYMLINK_NOFOLLOW described in - * man 2 fchmodat, but it is not currently implemented. 
%) - */ - } else { - if (chown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (!gfe->atim) + return 0; - if (chmod(path, gfe->mode)) { - pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); - goto err; - } - } + tv[0].tv_sec = gfe->atim->tv_sec; + tv[0].tv_usec = gfe->atim->tv_usec; + tv[1].tv_sec = gfe->mtim->tv_sec; + tv[1].tv_usec = gfe->mtim->tv_usec; - if (gfe->atim) { - tv[0].tv_sec = gfe->atim->tv_sec; - tv[0].tv_usec = gfe->atim->tv_usec; - tv[1].tv_sec = gfe->mtim->tv_sec; - tv[1].tv_usec = gfe->mtim->tv_usec; - if (lutimes(path, tv)) { - pr_perror("Can't set access and modification times on ghost %s", path); - goto err; - } + if (lutimes(path, tv)) { + pr_perror("Can't set access and modification times on ghost %s", path); + return -1; } - ret = 0; -err: - return ret; + return 0; } static int create_ghost_dentry(char *path, GhostFileEntry *gfe, struct cr_img *img) @@ -1672,22 +1650,10 @@ static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, co */ static int get_build_id(const int fd, const struct stat *fd_status, unsigned char **build_id) { - char buf[SELFMAG + 1]; - void *start_addr; + char *start_addr; size_t mapped_size; int ret = -1; - if (read(fd, buf, SELFMAG + 1) != SELFMAG + 1) - return -1; - - /* - * The first 4 bytes contain a magic number identifying the file as an - * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and - * ‘F’, respectively. These characters are together defined as ELFMAG. - */ - if (strncmp(buf, ELFMAG, SELFMAG)) - return -1; - /* * If the build-id exists, then it will most likely be present in the * beginning of the file. 
Therefore at most only the first 1 MB of the @@ -1695,16 +1661,25 @@ static int get_build_id(const int fd, const struct stat *fd_status, unsigned cha */ mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE); start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); - if (start_addr == MAP_FAILED) { + if ((void*)start_addr == MAP_FAILED) { pr_warn("Couldn't mmap file with fd %d\n", fd); return -1; } - if (buf[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(start_addr, build_id, fd, mapped_size); - if (buf[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(start_addr, build_id, fd, mapped_size); + /* + * The first 4 bytes contain a magic number identifying the file as an + * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and + * ‘F’, respectively. These characters are together defined as ELFMAG. + */ + if (memcmp(start_addr, ELFMAG, SELFMAG)) + goto out; + if (start_addr[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32((Elf32_Ehdr *)start_addr, build_id, fd, mapped_size); + if (start_addr[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64((Elf64_Ehdr *)start_addr, build_id, fd, mapped_size); + +out: munmap(start_addr, mapped_size); return ret; } @@ -1818,7 +1793,8 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) } if (!skip_for_shell_job && mnt_is_overmounted(mi)) { - pr_err("Open files on overmounted mounts are not supported yet\n"); + pr_err("Open files on overmounted mounts are not supported yet; mount=%d fd=%d path=%s\n", + p->mnt_id, p->fd, link->name + 1); return -1; } @@ -2527,9 +2503,10 @@ static int open_filemap(int pid, struct vma_area *vma) * using dup because dup returns a reference to the same struct file inside kernel, but we * cannot open a new FD. 
*/ - ret = dup(plugin_fd); + ret = plugin_fd; } else if (vma->e->status & VMA_AREA_MEMFD) { - ret = memfd_open(vma->vmfd, &flags); + if (!inherited_fd(vma->vmfd, &ret)) + ret = memfd_open(vma->vmfd, &flags, true); } else { ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); } diff --git a/criu/img-streamer.c b/criu/img-streamer.c index 7e36eae012..305e6fae5e 100644 --- a/criu/img-streamer.c +++ b/criu/img-streamer.c @@ -12,6 +12,7 @@ #include "rst-malloc.h" #include "common/scm.h" #include "common/lock.h" +#include "action-scripts.h" /* * We use different path names for the dump and restore sockets because: @@ -49,10 +50,17 @@ static const char *socket_name_for_mode(int mode) int img_streamer_init(const char *image_dir, int mode) { struct sockaddr_un addr; + int pre_stream_ret; int sockfd; img_streamer_mode = mode; + pre_stream_ret = run_scripts(ACT_PRE_STREAM); + if (pre_stream_ret != 0) { + pr_err("Pre-stream script failed with %d!\n", pre_stream_ret); + return -1; + } + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { pr_perror("Unable to instantiate UNIX socket"); diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index c2e8850aab..6a331a32f8 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -4,6 +4,7 @@ #include "asm/int.h" enum script_actions { + ACT_PRE_STREAM, ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, @@ -16,6 +17,7 @@ enum script_actions { ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, ACT_STATUS_READY, + ACT_QUERY_EXT_FILES, ACT_MAX }; @@ -24,6 +26,8 @@ extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); +extern int rpc_query_external_files(void); +extern int exec_rpc_query_external_files(char *name, int sk); extern int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ diff --git a/criu/include/cgroup.h 
b/criu/include/cgroup.h index 93f61539cf..dc264032e8 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -9,7 +9,8 @@ struct parasite_dump_cgroup_args; extern u32 root_cg_set; int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); -int prepare_task_cgroup(struct pstree_item *); +int restore_task_cgroup(struct pstree_item *); +int prepare_cgroup_namespace(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. */ int prepare_cgroup_properties(void); diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index c7e98c756c..b3a477550d 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -67,6 +67,7 @@ struct cg_root_opt { enum NETWORK_LOCK_METHOD { NETWORK_LOCK_IPTABLES, NETWORK_LOCK_NFTABLES, + NETWORK_LOCK_SKIP, }; #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES @@ -156,6 +157,7 @@ struct cr_options { int track_mem; char *img_parent; int auto_dedup; + int skip_zero_pages; unsigned int cpu_cap; int force_irmap; char **exec_cmd; diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 69d670be93..fe75dfe860 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -19,6 +19,7 @@ enum faults { FI_HUGE_ANON_SHMEM_ID = 132, FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, + FI_DONT_USE_PAGEMAP_SCAN = 135, FI_MAX, }; diff --git a/criu/include/image.h b/criu/include/image.h index 5cb01bde20..9a275565f9 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -41,7 +41,7 @@ * implementation and it is specific to every kernel version, * its contents should not be dumped ever * - vdso,vvar - * the vDSO area, it might reqire additional memory + * the vDSO area, it might require additional memory * contents modification especially when tasks are * migrating between different kernel versions * - heap diff --git a/criu/include/kerndat.h 
b/criu/include/kerndat.h index 0b2f715f38..91dbd494b2 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -85,6 +85,8 @@ struct kerndat_s { bool has_ptrace_get_rseq_conf; struct __ptrace_rseq_configuration libc_rseq_conf; bool has_ipv6_freebind; + bool has_membarrier_get_registrations; + bool has_pagemap_scan; }; extern struct kerndat_s kdat; diff --git a/criu/include/log.h b/criu/include/log.h index 85e6dc2e72..cbed330076 100644 --- a/criu/include/log.h +++ b/criu/include/log.h @@ -60,6 +60,8 @@ void flush_early_log_buffer(int fd); #define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) +#define pr_pwarn(fmt, ...) pr_warn(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) + #endif /* CR_NOGLIBC */ #endif /* __CR_LOG_H__ */ diff --git a/criu/include/magic.h b/criu/include/magic.h index 22d7218e45..0e8c37234e 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -29,7 +29,7 @@ /* * The magic-s below correspond to coordinates - * of various Russian towns in the NNNNEEEE form. + * of various towns in the NNNNEEEE form. 
*/ #define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ diff --git a/criu/include/mem.h b/criu/include/mem.h index 03574ea3d7..3618c9cc3b 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -7,6 +7,7 @@ #include "pid.h" #include "proc_parse.h" #include "inventory.pb-c.h" +#include "pagemap-cache.h" struct parasite_ctl; struct vm_area_list; @@ -47,5 +48,6 @@ int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -bool should_dump_page(VmaEntry *vmae, u64 pme); + +u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); #endif /* __CR_MEM_H__ */ diff --git a/criu/include/memfd.h b/criu/include/memfd.h index 1b1dc79bbc..78d8100198 100644 --- a/criu/include/memfd.h +++ b/criu/include/memfd.h @@ -1,7 +1,9 @@ #ifndef __CR_MEMFD_H__ #define __CR_MEMFD_H__ +#include #include + #include "int.h" #include "common/config.h" @@ -12,7 +14,7 @@ extern int is_memfd(dev_t dev); extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); extern const struct fdtype_ops memfd_dump_ops; -extern int memfd_open(struct file_desc *d, u32 *fdflags); +extern int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap); extern struct collect_image_info memfd_cinfo; extern struct file_desc *collect_memfd(u32 id); extern int apply_memfd_seals(void); diff --git a/criu/include/net.h b/criu/include/net.h index 0da4cad137..5e8a848620 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -50,7 +50,6 @@ extern int kerndat_has_newifindex(void); extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, int fd, int *nsid); extern struct ns_id *net_get_root_ns(void); -extern int kerndat_nsid(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); extern struct ns_id *get_root_netns(void); diff --git 
a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h index 1d8bbffaf6..875e69e560 100644 --- a/criu/include/pagemap-cache.h +++ b/criu/include/pagemap-cache.h @@ -1,10 +1,12 @@ #ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ +#include #include #include "int.h" #include "common/list.h" +#include "pagemap_scan.h" struct vma_area; @@ -15,9 +17,15 @@ typedef struct { unsigned long start; /* start of area */ unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ + int fd; /* file to read PMs from */ + u64 *map; /* local buffer */ size_t map_len; /* length of a buffer */ - int fd; /* file to read PMs from */ + + struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */ + size_t regs_len; /* actual length of regs */ + size_t regs_max_len; /* maximum length of regs */ + size_t regs_idx; /* current index in the regs array */ } pmc_t; #define PMC_INIT \ @@ -26,7 +34,8 @@ typedef struct { } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); -extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); +extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma); extern void pmc_fini(pmc_t *pmc); +extern int pmc_fill(pmc_t *pmc, u64 start, u64 end); #endif /* __CR_PAGEMAP_H__ */ diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h new file mode 100644 index 0000000000..0ad4c9bc0b --- /dev/null +++ b/criu/include/pagemap_scan.h @@ -0,0 +1,68 @@ +#ifndef __CR_PAGEMAP_SCAN_H__ +#define __CR_PAGEMAP_SCAN_H__ + +#ifndef PAGEMAP_SCAN +#include +#include "int.h" + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. 
*/ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + u64 start; + u64 end; + u64 categories; +}; + +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. 
+ * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + u64 size; + u64 flags; + u64 start; + u64 end; + u64 walk_end; + u64 vec; + u64 vec_len; + u64 max_pages; + u64 category_inverted; + u64 category_mask; + u64 category_anyof_mask; + u64 return_mask; +}; +#endif /* PAGEMAP_SCAN */ + +#endif /* __CR_PAGEMAP_SCAN_H__ */ diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 787c927be9..1244220f67 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -118,6 +118,8 @@ static inline int posix_timers_dump_size(int timer_n) */ struct parasite_dump_misc { + bool has_membarrier_get_registrations; /* this is sent from criu to parasite. 
*/ + unsigned long brk; u32 pid; @@ -128,6 +130,7 @@ struct parasite_dump_misc { int dumpable; int thp_disabled; int child_subreaper; + int membarrier_registration_mask; }; /* @@ -148,6 +151,7 @@ struct parasite_dump_creds { int uids[4]; int gids[4]; + int no_new_privs; unsigned int secbits; unsigned int ngroups; /* diff --git a/criu/include/prctl.h b/criu/include/prctl.h index c843f40a75..4c2a548b16 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -30,6 +30,12 @@ #ifndef PR_SET_DUMPABLE #define PR_SET_DUMPABLE 4 #endif +#ifndef PR_GET_NO_NEW_PRIVS +#define PR_GET_NO_NEW_PRIVS 39 +#endif +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif #ifndef PR_SET_MM #define PR_SET_MM 35 diff --git a/criu/include/restorer.h b/criu/include/restorer.h index bc0beb5cbb..f398d8d8fe 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -72,7 +72,6 @@ struct thread_creds_args { u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; - unsigned int secbits; char *lsm_profile; unsigned int *groups; char *lsm_sockcreate; @@ -144,7 +143,7 @@ struct task_restore_args { struct timeval logstart; int uffd; - bool has_thp_enabled; + bool thp_disabled; /* threads restoration */ int nr_threads; /* number of threads */ @@ -230,6 +229,7 @@ struct task_restore_args { #endif int lsm_type; int child_subreaper; + int membarrier_registration_mask; bool has_clone3_set_tid; /* diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index d0a3db6c5d..704b42a727 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -73,8 +73,6 @@ struct rst_info { */ bool has_old_seccomp_filter; - bool has_thp_enabled; - struct rst_rseq *rseqe; void *breakpoint; diff --git a/criu/include/shmem.h b/criu/include/shmem.h index 813ef630ef..15cab11464 100644 --- a/criu/include/shmem.h +++ b/criu/include/shmem.h @@ -4,13 +4,14 @@ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" +#include "pagemap-cache.h" struct vma_area; extern int 
collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); -extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); +extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc); extern int fixup_sysv_shmems(void); extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 961d711ee7..b3a70fb27e 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -69,6 +69,7 @@ extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC #define setsockopt sys_setsockopt +#define pr_perror(fmt, ...) pr_err(fmt ": errno %d\n", ##__VA_ARGS__, -ret) #endif static inline void tcp_repair_off(int fd) { @@ -76,7 +77,7 @@ static inline void tcp_repair_off(int fd) ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) - pr_err("Failed to turn off repair mode on socket\n"); + pr_perror("Failed to turn off repair mode on socket %d", fd); } extern void tcp_locked_conn_add(struct inet_sk_info *); diff --git a/criu/include/stats.h b/criu/include/stats.h index d8dd159989..841ebdfe01 100644 --- a/criu/include/stats.h +++ b/criu/include/stats.h @@ -33,6 +33,8 @@ enum { CNT_SHPAGES_SKIPPED_PARENT, CNT_SHPAGES_WRITTEN, + CNT_SKIPPED_ZERO_PAGES, + DUMP_CNT_NR_STATS, }; diff --git a/criu/include/util.h b/criu/include/util.h index 4b4dfda950..4334e69c2d 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -263,6 +263,10 @@ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid); +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode); +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, 
gid_t new_gid, mode_t new_mode, int flags); + int fd_has_data(int lfd); int make_yard(char *path); @@ -274,8 +278,6 @@ static inline int sk_wait_data(int sk) } void fd_set_nonblocking(int fd, bool on); -void tcp_nodelay(int sk, bool on); -void tcp_cork(int sk, bool on); const char *ns_to_string(unsigned int ns); diff --git a/criu/include/vma.h b/criu/include/vma.h index 106c56af26..4b663ee500 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -122,8 +122,8 @@ static inline struct vma_area *vma_next(struct vma_area *vma) static inline bool vma_entry_can_be_lazy(VmaEntry *e) { return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && - !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && - !(e->flags & MAP_HUGETLB)); + !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VVAR)) && + !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && !(e->flags & MAP_HUGETLB)); } #endif /* __CR_VMA_H__ */ diff --git a/criu/irmap.c b/criu/irmap.c index 7b9d77bc1f..37d098db11 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -67,6 +67,7 @@ static struct irmap hints[] = { .path = "/var/log", .nr_kids = -1, }, + { .path = "/usr/share/dbus-1/services", .nr_kids = -1 }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, @@ -101,7 +102,7 @@ static int irmap_update_stat(struct irmap *i) pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { - pr_perror("Can't stat %s", i->path); + pr_pwarn("Can't stat %s", i->path); return -1; } @@ -136,7 +137,7 @@ static int irmap_update_dir(struct irmap *t) pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { - pr_perror("Can't open %s", t->path); + pr_pwarn("Can't open %s", t->path); return -1; } @@ -501,6 +502,6 @@ int irmap_scan_path_add(char *path) 
o->ir->path = path; o->ir->nr_kids = -1; - list_add(&o->node, &opts.irmap_scan_paths); + list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; } diff --git a/criu/kerndat.c b/criu/kerndat.c index bc0c7ba05d..95e7226b2b 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -12,11 +12,12 @@ #include #include #include -#include /* for sockaddr_in and inet_ntoa() */ +#include #include #include #include #include +#include #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include @@ -53,12 +54,14 @@ #include "memfd.h" #include "mount-v2.h" #include "util-caps.h" +#include "pagemap_scan.h" struct kerndat_s kdat = {}; +volatile int dummy_var; static int check_pagemap(void) { - int ret, fd; + int ret, fd, retry; u64 pfn = 0; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); @@ -72,11 +75,44 @@ static int check_pagemap(void) return -1; } - /* Get the PFN of some present page. Stack is here, so try it :) */ - ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); - if (ret != sizeof(pfn)) { - pr_perror("Can't read pagemap"); + if (ioctl(fd, PAGEMAP_SCAN, NULL) == 0) { + pr_err("PAGEMAP_SCAN succeeded unexpectedly\n"); return -1; + } else { + switch (errno) { + case EFAULT: + pr_debug("PAGEMAP_SCAN is supported\n"); + kdat.has_pagemap_scan = true; + break; + case EINVAL: + case ENOTTY: + pr_debug("PAGEMAP_SCAN isn't supported\n"); + break; + default: + pr_perror("PAGEMAP_SCAN failed with unexpected errno"); + return -1; + } + } + + retry = 3; + while (retry--) { + ++dummy_var; + /* Get the PFN of a page likely to be present. */ + ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); + if (ret != sizeof(pfn)) { + pr_perror("Can't read pagemap"); + close(fd); + return -1; + } + /* The page can be swapped out by the time the read occurs, + * in which case the rest of the bits are a swap type + offset + * (which could be zero even if not hidden). + * Retry if this happens. 
*/ + if (pfn & PME_PRESENT) + break; + pr_warn("got non-present PFN %#lx for the dummy data page; %s\n", (unsigned long)pfn, + retry ? "retrying" : "giving up"); + pfn = 0; } close(fd); @@ -465,8 +501,15 @@ static int get_last_cap(void) struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; + int ret; - return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + if (ret || kdat.last_cap < 32 * CR_CAP_SIZE) + return ret; + + pr_err("Kernel reports more capabilities than this CRIU supports: %u > %u\n", + kdat.last_cap, 32 * CR_CAP_SIZE - 1); + return -1; } static bool kerndat_has_memfd_create(void) @@ -615,29 +658,52 @@ static int kerndat_iptables_has_xtlocks(void) return 0; } -int kerndat_tcp_repair(void) +/* + * Unfortunately in C htonl() is not constexpr and cannot be used in a static + * initialization below. + */ +#define constant_htonl(x) \ + (__BYTE_ORDER == __BIG_ENDIAN ? (x) : \ + (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) + +static int kerndat_tcp_repair(void) { + static const struct sockaddr_in loopback_ip4 = { + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { constant_htonl(INADDR_LOOPBACK) }, + }; + static const struct sockaddr_in6 loopback_ip6 = { + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; int sock, clnt = -1, yes = 1, exit_code = -1; - struct sockaddr_in addr; - socklen_t aux; + const struct sockaddr *addr; + struct sockaddr_storage listener_addr; + socklen_t addrlen; - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); - addr.sin_port = 0; + addr = (const struct sockaddr *)&loopback_ip4; + addrlen = sizeof(loopback_ip4); sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) { + addr = (const struct sockaddr *)&loopback_ip6; + addrlen = 
sizeof(loopback_ip6); + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); + } if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } - if (bind(sock, (struct sockaddr *)&addr, sizeof(addr))) { + if (bind(sock, addr, addrlen)) { pr_perror("Unable to bind a socket"); goto err; } - aux = sizeof(addr); - if (getsockname(sock, (struct sockaddr *)&addr, &aux)) { + addrlen = sizeof(listener_addr); + if (getsockname(sock, (struct sockaddr *)&listener_addr, &addrlen)) { pr_perror("Unable to get a socket name"); goto err; } @@ -647,13 +713,13 @@ int kerndat_tcp_repair(void) goto err; } - clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + clnt = socket(addr->sa_family, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } - if (connect(clnt, (struct sockaddr *)&addr, sizeof(addr))) { + if (connect(clnt, (const struct sockaddr *)&listener_addr, addrlen)) { pr_perror("Unable to connect a socket"); goto err; } @@ -680,20 +746,22 @@ int kerndat_tcp_repair(void) return exit_code; } -int kerndat_nsid(void) +static int kerndat_nsid(void) { int nsid, sk; + kdat.has_nsid = false; + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { - pr_perror("Unable to create a netlink socket"); - return -1; + pr_pwarn("Unable to create a netlink socket: NSID can't be used."); + return 0; } if (net_get_nsid(sk, getpid(), &nsid) < 0) { - pr_err("NSID is not supported\n"); + pr_warn("NSID is not supported\n"); close(sk); - return -1; + return 0; } kdat.has_nsid = true; @@ -977,6 +1045,8 @@ int kerndat_sockopt_buf_lock(void) int sock; sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { pr_perror("Unable to create a socket"); return -1; @@ -1324,6 +1394,8 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); + if (!kdat.has_thp_disable) + 
pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE\n"); break; } } @@ -1367,17 +1439,20 @@ static bool kerndat_has_clone3_set_tid(void) */ pid = syscall(__NR_clone3, &args, sizeof(args)); - if (pid == -1 && (errno == ENOSYS || errno == E2BIG)) { - kdat.has_clone3_set_tid = false; - return 0; - } - if (pid == -1 && errno == EINVAL) { - kdat.has_clone3_set_tid = true; - } else { - pr_perror("Unexpected error from clone3"); + if (pid != -1) { + pr_err("Unexpected success: clone3() returned %d\n", pid); return -1; } + if (errno == ENOSYS || errno == E2BIG) + return 0; + + if (errno != EINVAL) { + pr_pwarn("Unexpected error from clone3"); + return 0; + } + + kdat.has_clone3_set_tid = true; return 0; } @@ -1545,17 +1620,10 @@ static int kerndat_has_nftables_concat(void) #define IPV6_FREEBIND 78 #endif -static int kerndat_has_ipv6_freebind(void) +static int __kerndat_has_ipv6_freebind(int sk) { - int sk, val; - - sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); - if (sk == -1) { - pr_perror("Unable to create a ipv6 dgram socket"); - return -1; - } + int val = 1; - val = 1; if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { if (errno == ENOPROTOOPT) { kdat.has_ipv6_freebind = false; @@ -1569,6 +1637,44 @@ static int kerndat_has_ipv6_freebind(void) return 0; } +static int kerndat_has_ipv6_freebind(void) +{ + int sk, ret; + + if (!kdat.ipv6) { + kdat.has_ipv6_freebind = false; + return 0; + } + + sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); + if (sk == -1) { + pr_perror("Unable to create a ipv6 dgram socket"); + return -1; + } + + ret = __kerndat_has_ipv6_freebind(sk); + close(sk); + return ret; +} + +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int kerndat_has_membarrier_get_registrations(void) +{ + int ret = syscall(__NR_membarrier, 1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0); + if (ret < 0) { + if (errno != EINVAL) { + return ret; + } + + kdat.has_membarrier_get_registrations = false; + } else { + 
kdat.has_membarrier_get_registrations = true; + } + + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1812,6 +1918,10 @@ int kerndat_init(void) pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_membarrier_get_registrations()) { + pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/libnetlink.c b/criu/libnetlink.c index f0304b0dbc..c7a84a44d3 100644 --- a/criu/libnetlink.c +++ b/criu/libnetlink.c @@ -214,8 +214,3 @@ int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], in return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } - -int32_t nla_get_s32(const struct nlattr *nla) -{ - return *(const int32_t *)nla_data(nla); -} diff --git a/criu/mem.c b/criu/mem.c index ab86a1f6d7..6ac7adb6b8 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -3,8 +3,10 @@ #include #include #include +#include #include #include +#include #include "types.h" #include "cr_options.h" @@ -31,6 +33,7 @@ #include "prctl.h" #include "compel/infect-util.h" #include "pidfd-store.h" +#include "xmalloc.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" @@ -99,7 +102,7 @@ static inline bool __page_in_parent(bool dirty) return opts.track_mem && opts.img_parent && !dirty; } -bool should_dump_page(VmaEntry *vmae, u64 pme) +static bool should_dump_entire_vma(VmaEntry *vmae) { /* * vDSO area must be always dumped because on restore @@ -107,30 +110,53 @@ bool should_dump_page(VmaEntry *vmae, u64 pme) */ if (vma_entry_is(vmae, VMA_AREA_VDSO)) return true; - /* - * In turn VVAR area is special and referenced from - * vDSO area by IP addressing (at least on x86) thus - * never ever dump its content but always use one provided - * by the kernel on restore, ie runtime VVAR 
area must - * be remapped into proper place.. - */ - if (vma_entry_is(vmae, VMA_AREA_VVAR)) - return false; - - /* - * Optimisation for private mapping pages, that haven't - * yet being COW-ed - */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) - return true; return false; } +/* + * should_dump_page returns vaddr if an addressed page has to be dumped. + * Otherwise, it returns an address that has to be inspected next. + */ +u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +{ + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) + return -1; + + if (pmc->regs) { + while (1) { + if (pmc->regs_idx == pmc->regs_len) + return pmc->end; + if (vaddr < pmc->regs[pmc->regs_idx].end) + break; + pmc->regs_idx++; + } + if (vaddr < pmc->regs[pmc->regs_idx].start) + return pmc->regs[pmc->regs_idx].start; + if (softdirty) + *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + return vaddr; + } else { + u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + + /* + * Optimisation for private mapping pages, that haven't + * yet being COW-ed + */ + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) + return vaddr + PAGE_SIZE; + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { + if (softdirty) + *softdirty = pme & PME_SOFT_DIRTY; + return vaddr; + } + + return vaddr + PAGE_SIZE; + } +} + bool page_is_zero(u64 pme) { return __page_is_zero(pme); @@ -161,28 +187,71 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * put the memory into the page-pipe's pipe. * * "Holes" in page-pipe are regions, that should be dumped, but - * the memory contents is present in the pagent image set. + * the memory contents is present in the parent image set. 
*/ -static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, +static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr, bool has_parent) { - u64 *at = &map[PAGE_PFN(*off)]; - unsigned long pfn, nr_to_scan; - unsigned long pages[3] = {}; + unsigned long nr_scanned; + /* Counters for PAGES_SKIPPED_PARENT, PAGES_LAZY, PAGES_WRITTEN and SKIPPED_ZERO_PAGES */ + unsigned long pages[4] = {}; + unsigned long vaddr; + bool dump_all_pages; int ret = 0; - nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; + static char *zero_page = NULL; + static char *remote_page = NULL; + int zero = 0; + struct iovec local[2]; + struct iovec remote[1]; + int nread = 0; + if (opts.skip_zero_pages && zero_page == NULL) { + zero_page = xmalloc(PAGE_SIZE); + remote_page = xmalloc(PAGE_SIZE); + if (zero_page == NULL || remote_page == NULL) { + pr_warn("Can't allocate memory - disabling --skip-zero-pages\n"); + opts.skip_zero_pages = 0; + } else { + memzero(zero_page, PAGE_SIZE); + local[0].iov_base = remote_page; + local[0].iov_len = PAGE_SIZE; + remote[0].iov_base = (void *)0x0; + remote[0].iov_len = PAGE_SIZE; + } + } - for (pfn = 0; pfn < nr_to_scan; pfn++) { - unsigned long vaddr; + dump_all_pages = should_dump_entire_vma(vma->e); + + nr_scanned = 0; + for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; + bool softdirty = false; + u64 next; int st; - if (!should_dump_page(vma->e, at[pfn])) - continue; - - vaddr = vma->e->start + *off + pfn * PAGE_SIZE; + /* If dump_all_pages is true, should_dump_page is called to get pme. 
*/ + next = should_dump_page(pmc, vma->e, vaddr, &softdirty); + if (!dump_all_pages) { + if (next != vaddr) { + vaddr = next - PAGE_SIZE; + continue; + } else if (opts.skip_zero_pages) { + remote[0].iov_base = (void *)vaddr; + nread = process_vm_readv(item->pid->real, local, 1, remote, 1, 0); + if (nread == PAGE_SIZE) { + zero = memcmp(zero_page, remote_page, PAGE_SIZE); + /* + * If the page contains just zeros we can treat it like the zero page and skip it. + * At restore it will be replaced by a reference to the zero page and COWed if accessed. + */ + if (zero == 0) { + pages[3]++; + continue; + } + } + } + } if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) ppb_flags |= PPB_LAZY; @@ -194,7 +263,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. */ - if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { + if (has_parent && page_in_parent(softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { @@ -214,14 +283,15 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct pages[st]++; } - *off += pfn * PAGE_SIZE; - - cnt_add(CNT_PAGES_SCANNED, nr_to_scan); + *pvaddr = vaddr; + cnt_add(CNT_PAGES_SCANNED, nr_scanned); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); + cnt_add(CNT_SKIPPED_ZERO_PAGES, pages[3]); - pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes\n", pages[2] + pages[1], pages[1], pages[0]); + pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes %lu skipped zero\n", + pages[2] + pages[1], pages[1], pages[0], pages[3]); return ret; } @@ -356,12 +426,20 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode) { - u64 off = 0; - u64 *map; + 
u64 vaddr; int ret; if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED)) return 0; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. + */ + if (vma_entry_is(vma->e, VMA_AREA_VVAR)) + return 0; /* * To facilitate any combination of pre-dump modes to run after @@ -421,15 +499,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - map = pmc_get_map(pmc, vma); - if (!map) + if (pmc_get_map(pmc, vma)) return -1; if (vma_area_is(vma, VMA_ANON_SHARED)) - return add_shmem_area(item->pid->real, vma->e, map); - + return add_shmem_area(item->pid->real, vma->e, pmc); + vaddr = vma->e->start; again: - ret = generate_iovs(item, vma, pp, map, &off, has_parent); + ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); @@ -1217,8 +1294,6 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { - MmEntry *mm = rsti(t)->mm; - /* * There is no need to disable it if the page read doesn't * have parent. 
In this case VMA will be empty until @@ -1241,8 +1316,6 @@ static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) pr_perror("Cannot disable THP"); return -1; } - if (!(mm->has_thp_disabled && mm->thp_disabled)) - rsti(t)->has_thp_enabled = true; return 0; } diff --git a/criu/memfd.c b/criu/memfd.c index da29377034..9d9f0621fc 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -46,6 +46,7 @@ struct memfd_restore_inode { int fdstore_id; unsigned int pending_seals; MemfdInodeEntry *mie; + bool was_opened_rw; }; static LIST_HEAD(memfd_inodes); @@ -91,10 +92,21 @@ static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char * mie.has_hugetlb_flag = true; mie.hugetlb_flag = flag | MFD_HUGETLB; } + mie.mode = st->st_mode; + mie.has_mode = true; mie.seals = fcntl(fd, F_GET_SEALS); - if (mie.seals == -1) - goto out; + if (mie.seals == -1) { + if (errno != EINVAL || ~mie.hugetlb_flag & MFD_HUGETLB) { + pr_perror("fcntl(F_GET_SEALS)"); + goto out; + } + /* Kernels before 4.16 don't allow MFD_HUGETLB | + * MFD_ALLOW_SEALING and return EINVAL for + * fcntl(MFD_HUGETLB-enabled fd). 
+ */ + mie.seals = F_SEAL_SEAL; + } if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_INODE), &mie, PB_MEMFD_INODE)) goto out; @@ -222,6 +234,7 @@ static int collect_one_memfd_inode(void *o, ProtobufCMessage *base, struct cr_im mutex_init(&inode->lock); inode->fdstore_id = -1; inode->pending_seals = 0; + inode->was_opened_rw = false; list_add_tail(&inode->list, &memfd_inodes); @@ -270,8 +283,13 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; - if (fchown(fd, mie->uid, mie->gid)) { - pr_perror("Can't change uid %d gid %d of memfd:%s", (int)mie->uid, (int)mie->gid, mie->name); + if (mie->has_mode) + ret = cr_fchperm(fd, mie->uid, mie->gid, mie->mode); + else + ret = cr_fchown(fd, mie->uid, mie->gid); + if (ret) { + pr_perror("Can't set permissions { uid %d gid %d mode %#o } of memfd:%s", (int)mie->uid, + (int)mie->gid, mie->has_mode ? (int)mie->mode : -1, mie->name); goto out; } @@ -305,7 +323,7 @@ static int memfd_open_inode(struct memfd_restore_inode *inode) return fd; } -int memfd_open(struct file_desc *d, u32 *fdflags) +int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) { struct memfd_info *mfi; MemfdFileEntry *mfe; @@ -315,57 +333,80 @@ int memfd_open(struct file_desc *d, u32 *fdflags) mfi = container_of(d, struct memfd_info, d); mfe = mfi->mfe; - if (inherited_fd(d, &fd)) - return fd; - pr_info("Restoring memfd id=%d\n", mfe->id); fd = memfd_open_inode(mfi->inode); if (fd < 0) - goto err; + return -1; /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; + + if (filemap && (flags & O_ACCMODE) == O_RDWR) + return fd; + + if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { + /* + * If there is only a single RW-opened fd for a memfd, it can + * be used to pass it to execveat() with AT_EMPTY_PATH to have + * its contents executed. 
This currently works only for the + * original fd from memfd_create() so return the original fd + * once -- in case the caller expects to be the sole opener + * and does execveat() from this memfd. + */ + if (!fcntl(fd, F_SETFL, flags)) { + mfi->inode->was_opened_rw = true; + return fd; + } + + pr_pwarn("Can't change fd flags to %#o for memfd id=%d", flags, mfe->id); + } + /* * Ideally we should call compat version open() to not force the * O_LARGEFILE file flag with regular open(). It doesn't seem that * important though. */ _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); - if (_fd < 0) { + if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); - goto err; - } + else if (!filemap && (flags & O_ACCMODE) == O_RDWR) + pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); + close(fd); - fd = _fd; + return _fd; +} + +static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) +{ + MemfdFileEntry *mfe; + int fd; + + if (inherited_fd(d, new_fd)) + return 0; + + fd = memfd_open(d, NULL, false); + if (fd < 0) + return -1; + + mfe = container_of(d, struct memfd_info, d)->mfe; if (restore_fown(fd, mfe->fown) < 0) goto err; if (lseek(fd, mfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file position of memfd id=%d", mfe->id); + pr_perror("Can't restore file position of %d for memfd id=%d", fd, mfe->id); goto err; } - return fd; + *new_fd = fd; + return 0; err: - if (fd >= 0) - close(fd); + close(fd); return -1; } -static int memfd_open_fe_fd(struct file_desc *fd, int *new_fd) -{ - int tmp; - - tmp = memfd_open(fd, NULL); - if (tmp < 0) - return -1; - *new_fd = tmp; - return 0; -} - static char *memfd_d_name(struct file_desc *d, char *buf, size_t s) { MemfdInodeEntry *mie = NULL; diff --git a/criu/mount.c b/criu/mount.c index db9db63b27..afbd242810 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -1197,8 +1197,8 @@ int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinf dev == 
pm->s_dev_rt) return 0; - pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, - pm->fstype->name, pm->ns_mountpoint); + pr_warn("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, + pm->fstype->name, pm->ns_mountpoint); return -1; } @@ -1239,12 +1239,16 @@ int __open_mountpoint(struct mount_info *pm) int open_mount(unsigned int s_dev) { struct mount_info *m; + int mnt_fd; m = lookup_mnt_sdev(s_dev); if (!m) return -ENOENT; - return __open_mountpoint(m); + mnt_fd = __open_mountpoint(m); + if (mnt_fd < 0) + pr_err("Can't open mount %#x\n", s_dev); + return mnt_fd; } /* Bind-mount a mount point in a temporary place without children */ @@ -2823,7 +2827,7 @@ static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { - struct mount_info *mi; /* child is remaped into the root yards */ + struct mount_info *mi; /* child is remapped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; diff --git a/criu/namespaces.c b/criu/namespaces.c index b1b5303fa5..b7c0ab4008 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1454,7 +1454,7 @@ int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) * each other easily. Stream socket require manual * messages boundaries. * - * b) Make callers note the damon death by seeing the + * b) Make callers note the daemon death by seeing the * disconnected socket. In case of dgram socket * callers would just get stuck in receiving the * response. 
diff --git a/criu/net.c b/criu/net.c index 2793b18e66..0f7280bb50 100644 --- a/criu/net.c +++ b/criu/net.c @@ -111,15 +111,18 @@ int read_ns_sys_file(char *path, char *buf, int len) } rlen = read(fd, buf, len); + if (rlen == -1) + pr_perror("Can't read ns' %s", path); close(fd); if (rlen == len) { + buf[0] = '\0'; pr_err("Too small buffer to read ns sys file %s\n", path); return -1; } - if (rlen > 0) - buf[rlen - 1] = '\0'; + if (rlen >= 0) + buf[rlen] = '\0'; return rlen; } @@ -2435,27 +2438,39 @@ static inline int do_restore_nftables(struct cr_img *img) off_t img_data_size; char *buf; - if ((img_data_size = img_raw_size(img)) < 0) + if ((img_data_size = img_raw_size(img)) < 0) { + pr_err("image size mismatch\n"); goto out; + } - if (read_img_str(img, &buf, img_data_size) < 0) + if (read_img_str(img, &buf, img_data_size) < 0) { + pr_err("Failed to read nftables data\n"); goto out; + } nft = nft_ctx_new(NFT_CTX_DEFAULT); - if (!nft) + if (!nft) { + pr_err("Failed to create nft context object\n"); goto buf_free_out; + } + + if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft)) { + pr_err("Failed to enable std/err output buffering\n"); + goto nft_ctx_free_out; + } - if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft) || #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) - nft_run_cmd_from_buffer(nft, buf, strlen(buf))) + if (nft_run_cmd_from_buffer(nft, buf, strlen(buf))) #elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) - nft_run_cmd_from_buffer(nft, buf)) + if (nft_run_cmd_from_buffer(nft, buf)) #else - { - BUILD_BUG_ON(1); - } + BUILD_BUG_ON(1); #endif + { + pr_err("nft command error:\n%s\n%s\n", + nft_ctx_get_error_buffer(nft), buf); goto nft_ctx_free_out; + } exit_code = 0; @@ -3131,6 +3146,9 @@ int network_lock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; @@ -3172,19 +3190,53 @@ static inline int nftables_network_unlock(void) #endif 
} +static int iptables_has_criu_jump_target(void) +{ + int fd, ret; + char *argv[4] = { "sh", "-c", "iptables -C INPUT -j CRIU", NULL }; + + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + fd = -1; + pr_perror("failed to open /dev/null, using log fd"); + } + + ret = cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL); + close_safe(&fd); + return ret; +} + static int iptables_network_unlock_internal(void) { - char conf[] = "*filter\n" - ":CRIU - [0:0]\n" - "-D INPUT -j CRIU\n" - "-D OUTPUT -j CRIU\n" - "-X CRIU\n" - "COMMIT\n"; + char delete_jump_targets[] = "*filter\n" + ":CRIU - [0:0]\n" + "-D INPUT -j CRIU\n" + "-D OUTPUT -j CRIU\n" + "COMMIT\n"; + + char delete_criu_chain[] = "*filter\n" + ":CRIU - [0:0]\n" + "-X CRIU\n" + "COMMIT\n"; + int ret = 0; - ret |= iptables_restore(false, conf, sizeof(conf) - 1); + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); if (kdat.ipv6) - ret |= iptables_restore(true, conf, sizeof(conf) - 1); + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + + /* For compatibility with iptables-nft backend, we need to make sure that all jump + * targets have been removed before deleting the CRIU chain. 
+ */ + if (!iptables_has_criu_jump_target()) { + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + } + + ret |= iptables_restore(false, delete_criu_chain, sizeof(delete_criu_chain) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_criu_chain, sizeof(delete_criu_chain) - 1); return ret; } @@ -3193,6 +3245,9 @@ static int network_unlock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; @@ -3265,7 +3320,7 @@ int macvlan_ext_add(struct external *ext) /* * The setns() syscall (called by switch_ns()) can be extremely * slow. If we call it two or more times from the same task the - * kernel will synchonously go on a very slow routine called + * kernel will synchronously go on a very slow routine called * synchronize_rcu() trying to put a reference on old namespaces. 
* * To avoid doing this more than once we pre-create all the @@ -3433,7 +3488,7 @@ struct ns_id *net_get_root_ns(void) /* * socket_diag doesn't report unbound and unconnected sockets, - * so we have to get their network namesapces explicitly + * so we have to get their network namespaces explicitly */ struct ns_id *get_socket_ns(int lfd) { diff --git a/criu/netfilter.c b/criu/netfilter.c index 2212fd9f23..9e78dc4b03 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -48,8 +48,8 @@ void preload_netfilter_modules(void) fd = -1; pr_perror("failed to open /dev/null, using log fd for net module preload"); } - cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, 0); - cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, 0); + cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, CRS_CAN_FAIL); + cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, CRS_CAN_FAIL); close_safe(&fd); } diff --git a/criu/page-pipe.c b/criu/page-pipe.c index 54dc3ccc41..aab6742be7 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -99,6 +99,7 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl { struct page_pipe_buf *prev = pp_prev_ppb(pp, ppb_flags); struct page_pipe_buf *ppb; + int ppb_size = 0; ppb = xmalloc(sizeof(*ppb)); if (!ppb) @@ -120,7 +121,13 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl cnt_add(CNT_PAGE_PIPES, 1); ppb->pipe_off = 0; - ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE; + ppb_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0); + if (ppb_size < 0) { + xfree(ppb); + pr_perror("Can't get pipe size"); + return NULL; + } + ppb->pipe_size = ppb_size / PAGE_SIZE; pp->nr_pipes++; } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 782d4cafce..94f4774148 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -2,6 +2,7 @@ #include #include 
#include +#include #include #include #include @@ -157,6 +158,20 @@ static inline int send_psi(int sk, struct page_server_iov *pi) return send_psi_flags(sk, pi, 0); } +static void tcp_cork(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_CORK=%d", val); +} + +static void tcp_nodelay(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_NODELAY=%d", val); +} + /* page-server xfer */ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long len) { diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 00f088ff3f..978a6b1aca 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -1,5 +1,6 @@ #include #include +#include #include "page.h" #include "pagemap-cache.h" @@ -10,6 +11,7 @@ #include "vma.h" #include "mem.h" #include "kerndat.h" +#include "fault-injection.h" #undef LOG_PREFIX #define LOG_PREFIX "pagemap-cache: " @@ -22,6 +24,8 @@ #define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64)) +#define PAGE_REGIONS_MAX_NR 32768 + /* * It's a workaround for a kernel bug. In the 3.19 kernel when pagemap are read * for a few vma-s for one read call, it returns incorrect data. 
@@ -50,10 +54,23 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz pmc->pid = pid; pmc->map_len = PAGEMAP_LEN(map_size); pmc->vma_head = vma_head; - - pmc->map = xmalloc(pmc->map_len); - if (!pmc->map) - goto err; + pmc->regs_max_len = PAGE_PFN(map_size); + if (pmc->regs_max_len > PAGE_REGIONS_MAX_NR) + pmc->regs_max_len = PAGE_REGIONS_MAX_NR; + pmc->regs_len = 0; + pmc->regs_idx = 0; + pmc->regs = NULL; + pmc->map = NULL; + + if (kdat.has_pagemap_scan && !fault_injected(FI_DONT_USE_PAGEMAP_SCAN)) { + pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region)); + if (!pmc->regs) + goto err; + } else { + pmc->map = xmalloc(pmc->map_len); + if (!pmc->map) + goto err; + } if (pagemap_cache_disabled) pr_warn_once("The pagemap cache is disabled\n"); @@ -87,17 +104,11 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz return -1; } -static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr) -{ - return &pmc->map[PAGE_PFN(addr - pmc->start)]; -} - static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) { unsigned long low = vma->e->start & PMC_MASK; unsigned long high = low + PMC_SIZE; size_t len = vma_area_len(vma); - size_t size_map; if (high > kdat.task_size) high = kdat.task_size; @@ -115,7 +126,7 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) * fit in solid manner, iow -- either the whole vma fits * the cache window, either plain read is used. * - * The benefit (apart redusing the number of read() calls) + * The benefit (apart reducing the number of read() calls) * is to walk page tables less. 
*/ if (!pagemap_cache_disabled && len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) { @@ -149,39 +160,79 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) pr_debug("\t%d: simple mode [l:%lx h:%lx]\n", pmc->pid, pmc->start, pmc->end); } + return pmc_fill(pmc, pmc->start, pmc->end); +} + +int pmc_fill(pmc_t *pmc, u64 start, u64 end) +{ + size_t size_map; + + pmc->start = start; + pmc->end = end; + size_map = PAGEMAP_LEN(pmc->end - pmc->start); BUG_ON(pmc->map_len < size_map); BUG_ON(pmc->fd < 0); - if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { - pmc_zap(pmc); - pr_perror("Can't read %d's pagemap file", pmc->pid); - return -1; + if (pmc->regs) { + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = pmc->start, + .end = pmc->end, + .vec = (long)pmc->regs, + .vec_len = pmc->regs_max_len, + .max_pages = 0, + /* + * Request pages that are in RAM or swap, excluding + * zero-filled and file-backed pages. 
+ */ + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; + long ret; + + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); + if (ret == -1) { + pr_perror("PAGEMAP_SCAN"); + pmc_zap(pmc); + return -1; + } + pmc->regs_len = ret; + pmc->regs_idx = 0; + pmc->end = args.walk_end; + } else { + if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } } return 0; } -u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) +int pmc_get_map(pmc_t *pmc, const struct vma_area *vma) { /* Hit */ if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) - return __pmc_get_map(pmc, vma->e->start); + return 0; /* Miss, refill the cache */ if (pmc_fill_cache(pmc, vma)) { pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end); - return NULL; + return -1; } - - /* Hit for sure */ - return __pmc_get_map(pmc, vma->e->start); + return 0; } void pmc_fini(pmc_t *pmc) { close_safe(&pmc->fd); xfree(pmc->map); + xfree(pmc->regs); pmc_reset(pmc); } diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 35489634d9..295e404ec5 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -115,6 +115,10 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + if (c->no_new_privs > 0) { + ce->no_new_privs = c->no_new_privs; + ce->has_no_new_privs = true; + } ce->secbits = c->secbits; ce->n_groups = c->ngroups; @@ -429,6 +433,7 @@ int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_mis struct parasite_dump_misc *ma; ma = 
compel_parasite_args(ctl, struct parasite_dump_misc); + ma->has_membarrier_get_registrations = kdat.has_membarrier_get_registrations; if (compel_rpc_call_sync(PARASITE_CMD_DUMP_MISC, ctl) < 0) return -1; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 2303f41c39..e151ed6563 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -211,6 +211,63 @@ static int dump_thread_common(struct parasite_dump_thread *ti) return ret; } +/* + * Returns a membarrier() registration command (it is a bitmask) if the process + * was registered for specified (as a bit index) membarrier()-issuing command; + * returns zero otherwise. + */ +static int get_membarrier_registration_mask(int cmd_bit) +{ + unsigned cmd = 1 << cmd_bit; + int ret; + + /* + * Issuing a barrier will be successful only if the process was registered + * for this type of membarrier. All errors are a sign that the type issued + * was not registered (EPERM) or not supported by kernel (EINVAL or ENOSYS). + */ + ret = sys_membarrier(cmd, 0, 0); + if (ret && ret != -EPERM && ret != -EINVAL && ret != -ENOSYS) { + pr_err("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + return -1; + } + pr_debug("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + /* + * For supported registrations, MEMBARRIER_CMD_REGISTER_xxx = MEMBARRIER_CMD_xxx << 1. + * See: enum membarrier_cmd in include/uapi/linux/membarrier.h in kernel sources. + */ + return ret ? 0 : cmd << 1; +} + +/* + * It would be better to check the following with BUILD_BUG_ON, but we might + * have an old linux/membarrier.h header without necessary enum values. 
+ */ +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED 3 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE 5 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ 7 +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int dump_membarrier_compat(int *membarrier_registration_mask) +{ + int ret; + + *membarrier_registration_mask = 0; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + return 0; +} + static int dump_misc(struct parasite_dump_misc *args) { int ret; @@ -225,6 +282,19 @@ static int dump_misc(struct parasite_dump_misc *args) args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (args->has_membarrier_get_registrations) { + ret = sys_membarrier(1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + pr_err("membarrier(1 << %d) returned %d\n", MEMBARRIER_CMDBIT_GET_REGISTRATIONS, ret); + return -1; + } + args->membarrier_registration_mask = ret; + } else { + ret = dump_membarrier_compat(&args->membarrier_registration_mask); + if (ret) + return ret; + } + ret = sys_prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&args->child_subreaper, 0, 0, 0); if (ret) pr_err("PR_GET_CHILD_SUBREAPER failed (%d)\n", ret); @@ -268,6 +338,7 @@ static int dump_creds(struct parasite_dump_creds *args) } } + args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); ret = sys_getgroups(0, NULL); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 5e78e74d4f..02971657ef 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ 
-51,6 +51,17 @@ #include "shmem.h" #include "restorer.h" +/* + * sys_getgroups() buffer size. Not too much, to avoid stack overflow. + */ +#define MAX_GETGROUPS_CHECKED (512 / sizeof(unsigned int)) + +/* + * Memory overhead limit for reading VMA when auto_dedup is enabled. + * An arbitrarily chosen trade-off point between speed and memory usage. + */ +#define AUTO_DEDUP_OVERHEAD_BYTES (128 << 20) + #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 #endif @@ -93,7 +104,7 @@ bool fault_injected(enum faults f) * Hint: compel on aarch64 shall learn relocs for that. */ static unsigned __page_size; -unsigned page_size(void) +unsigned long page_size(void) { return __page_size; } @@ -191,22 +202,41 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ int b, i, ret; struct cap_header hdr; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; - - /* - * We're still root here and thus can do it without failures. - */ + int ruid, euid, suid, fsuid; + int rgid, egid, sgid, fsgid; /* * Setup supplementary group IDs early. */ if (args->groups) { - ret = sys_setgroups(ce->n_groups, args->groups); - if (ret) { - pr_err("Can't setup supplementary group IDs: %d\n", ret); - return -1; + /* + * We may be in an unprivileged user namespace where setgroups + * is disabled. If the current list of groups is already what + * we want, skip the call to setgroups. + */ + unsigned int gids[MAX_GETGROUPS_CHECKED]; + int n = sys_getgroups(MAX_GETGROUPS_CHECKED, gids); + if (n != ce->n_groups || memcmp(gids, args->groups, n * sizeof(*gids))) { + ret = sys_setgroups(ce->n_groups, args->groups); + if (ret) { + pr_err("Can't setgroups([%zu gids]): %d\n", ce->n_groups, ret); + return -1; + } } } + /* + * Compare xids with current values. If all match then we can skip + * setting them (which requires extra capabilities). 
+ */ + fsuid = sys_setfsuid(-1); + fsgid = sys_setfsgid(-1); + if (sys_getresuid(&ruid, &euid, &suid) == 0 && sys_getresgid(&rgid, &egid, &sgid) == 0 && ruid == ce->uid && + euid == ce->euid && suid == ce->suid && rgid == ce->gid && egid == ce->egid && sgid == ce->sgid && + fsuid == ce->fsuid && fsgid == ce->fsgid) { + goto skip_xids; + } + /* * First -- set the SECURE_NO_SETUID_FIXUP bit not to * lose caps bits when changing xids. @@ -250,12 +280,13 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ return -1; } +skip_xids: /* * Third -- restore securebits. We don't need them in any * special state any longer. */ - if (!uid) { + if (sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0) != ce->secbits) { ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); if (ret) { pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); @@ -276,10 +307,18 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ /* already set */ continue; ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0); - if (ret) { + if (!ret) + continue; + if (!ce->has_no_new_privs || !ce->no_new_privs || args->cap_prm[b] & (1 << i)) { pr_err("Unable to drop capability %d: %d\n", i + b * 32, ret); return -1; } + /* + * If prctl(NO_NEW_PRIVS) is going to be set then it + * will prevent inheriting the capabilities not in + * the permitted set. 
+ */ + pr_warn("Unable to drop capability %d from bset: %d (but NO_NEW_PRIVS will drop it)\n", i + b * 32, ret); } } @@ -320,6 +359,14 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ if (lsm_set_label(args->lsm_sockcreate, "sockcreate", procfd) < 0) return -1; + if (ce->has_no_new_privs && ce->no_new_privs) { + ret = sys_prctl(PR_SET_NO_NEW_PRIVS, ce->no_new_privs, 0, 0, 0); + if (ret) { + pr_err("Unable to set no_new_privs=%d: %d\n", ce->no_new_privs, ret); + return -1; + } + } + return 0; } @@ -688,7 +735,7 @@ static int recv_cg_set_restore_ack(int sk) * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. */ -long __export_restore_thread(struct thread_restore_args *args) +__visible long __export_restore_thread(struct thread_restore_args *args) { struct rt_sigframe *rt_sigframe; k_rtsigset_t to_block; @@ -1068,7 +1115,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) * |G|----tgt----| | * * 3. remap src to any other place. 
- * G prevents src from being remaped on tgt again + * G prevents src from being remapped on tgt again * | |-------------| -> |+++++src+++++| * |G|---tgt-----| | * @@ -1169,7 +1216,7 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { int ret, i; - kernel_timer_t next_id; + kernel_timer_t next_id = 0, timer_id; struct sigevent sev; for (i = 0; i < args->posix_timers_n; i++) { @@ -1183,25 +1230,26 @@ static int create_posix_timers(struct task_restore_args *args) sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; while (1) { - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); return ret; } - if (next_id == args->posix_timers[i].spt.it_id) + if (timer_id != next_id) { + pr_err("Can't create timers, kernel don't give them consequently\n"); + return -1; + } + next_id++; + + if (timer_id == args->posix_timers[i].spt.it_id) break; - ret = sys_timer_delete(next_id); + ret = sys_timer_delete(timer_id); if (ret < 0) { - pr_err("Can't remove temporaty posix timer 0x%x\n", next_id); + pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); return ret; } - - if ((long)next_id > args->posix_timers[i].spt.it_id) { - pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; - } } } @@ -1228,7 +1276,7 @@ unsigned long vdso_rt_size = 0; void *bootstrap_start = NULL; unsigned int bootstrap_len = 0; -void __export_unmap(void) +__visible void __export_unmap(void) { sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size); } @@ -1435,6 +1483,40 @@ static int fd_poll(int inotify_fd) return sys_ppoll(&pfd, 1, &tmo, NULL, sizeof(sigset_t)); } +/* + * Call preadv() but limit size of the read. Zero `max_to_read` skips the limit. 
+ */ +static ssize_t preadv_limited(int fd, struct iovec *iovs, int nr, off_t offs, size_t max_to_read) +{ + size_t saved_last_iov_len = 0; + ssize_t ret; + + if (max_to_read) { + for (int i = 0; i < nr; ++i) { + if (iovs[i].iov_len <= max_to_read) { + max_to_read -= iovs[i].iov_len; + continue; + } + + if (!max_to_read) { + nr = i; + break; + } + + saved_last_iov_len = iovs[i].iov_len; + iovs[i].iov_len = max_to_read; + nr = i + 1; + break; + } + } + + ret = sys_preadv(fd, iovs, nr, offs); + if (saved_last_iov_len) + iovs[nr - 1].iov_len = saved_last_iov_len; + + return ret; +} + /* * In the worst case buf size should be: * sizeof(struct inotify_event) * 2 + PATH_MAX @@ -1495,6 +1577,30 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) return 0; } +/* + * Restore membarrier() registrations. + */ +static int restore_membarrier_registrations(int mask) +{ + unsigned long bitmap[1] = { mask }; + int i, err, ret = 0; + + if (!mask) + return 0; + + pr_info("Restoring membarrier() registrations %x\n", mask); + + for_each_bit(i, bitmap) { + err = sys_membarrier(1 << i, 0, 0); + if (!err) + continue; + pr_err("Can't restore membarrier(1 << %d) registration: %d\n", i, err); + ret = -1; + } + + return ret; +} + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -1502,7 +1608,7 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) * and jump execution to some predefined ip read from * core file. 
*/ -long __export_restore_task(struct task_restore_args *args) +__visible long __export_restore_task(struct task_restore_args *args) { long ret = -1; int i; @@ -1634,17 +1740,13 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } - if (args->uffd > -1) { - /* re-enable THP if we disabled it previously */ - if (args->has_thp_enabled) { - int ret; - ret = sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0); - if (ret) { - pr_err("Cannot re-enable THP: %d\n", ret); - goto core_restore_end; - } - } + ret = sys_prctl(PR_SET_THP_DISABLE, args->thp_disabled, 0, 0, 0); + if (ret) { + pr_err("Cannot restore THP_DISABLE=%d flag: %ld\n", args->thp_disabled, ret); + goto core_restore_end; + } + if (args->uffd > -1) { pr_debug("lazy-pages: closing uffd %d\n", args->uffd); /* * All userfaultfd configuration has finished at this point. @@ -1686,7 +1788,12 @@ long __export_restore_task(struct task_restore_args *args) while (nr) { pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); - r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off); + /* + * If we're requested to punch holes in the file after reading we do + * it to save memory. Limit the reads then to an arbitrary block size. + */ + r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, + args->auto_dedup ? 
AUTO_DEDUP_OVERHEAD_BYTES : 0); if (r < 0) { pr_err("Can't read pages data (%d)\n", (int)r); goto core_restore_end; @@ -1985,6 +2092,9 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } + if (restore_membarrier_registrations(args->membarrier_registration_mask) < 0) + goto core_restore_end; + pr_info("%ld: Restored\n", sys_getpid()); restore_finish_stage(task_entries_local, CR_STATE_RESTORE); diff --git a/criu/pipes.c b/criu/pipes.c index 43ff06e3d8..daada88306 100644 --- a/criu/pipes.c +++ b/criu/pipes.c @@ -434,7 +434,7 @@ int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms /* steal_pipe has to be able to fit all data from a target pipe */ if (fcntl(steal_pipe[1], F_SETPIPE_SZ, pipe_size) < 0) { pr_perror("Unable to set a pipe size"); - goto err; + goto err_close; } bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 5e96b5c963..2b94050350 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -338,7 +338,7 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("Can't open mapped [%s]", fname); - goto returnerr; + return -1; } if (vma_stat(vma, fd)) { @@ -379,7 +379,6 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); closefd: close(fd); -returnerr: return -1; } @@ -842,6 +841,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du goto err; } + pr_debug("Handling VMA with the following smaps entry: %s\n", str); if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; @@ -1407,7 +1407,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, 
&new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -1972,10 +1972,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) " pos:%lli ino:%lx sdev:%x", &e->tfd, &e->events, (long long *)&e->data, (long long *)&e->pos, (long *)&e->inode, &e->dev); - if (ret < 3 || ret > 6) { - eventpoll_tfd_entry__free_unpacked(e, NULL); - goto parse_err; - } else if (ret == 3) { + if (ret == 3) { e->has_dev = false; e->has_inode = false; e->has_pos = false; @@ -1983,7 +1980,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) e->has_dev = true; e->has_inode = true; e->has_pos = true; - } else if (ret < 6) { + } else { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } @@ -2208,10 +2205,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { - num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } diff --git a/criu/shmem.c b/criu/shmem.c index c13a39b660..9e3178352d 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,23 +206,28 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) +static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { 
unsigned long shmem_pfn, vma_pfn, vma_pgcnt; + u64 vaddr; if (!is_shmem_tracking_en()) return; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); - for (vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) { - if (!should_dump_page(vma, map[vma_pfn])) + for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { + bool softdirty = false; + u64 next; + + next = should_dump_page(pmc, vma, vaddr, &softdirty); + if (next != vaddr) { + vaddr = next - PAGE_SIZE; continue; + } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (map[vma_pfn] & PME_SOFT_DIRTY) + if (softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); - else if (page_is_zero(map[vma_pfn])) - set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } @@ -648,7 +653,7 @@ static int open_shmem(int pid, struct vma_area *vma) return -1; } -int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) +int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) { struct shmem_info *si; unsigned long size = vma->pgoff + (vma->end - vma->start); @@ -662,7 +667,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } @@ -679,7 +684,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 4bd5abff17..24e92a8521 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -416,9 +416,11 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io } else { ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + ret |= dump_opt(sk, SOL_IP, IP_TOS, &ioe->tos); } ioe->has_freebind = ioe->freebind; ioe->has_pktinfo = !!ioe->pktinfo; + ioe->has_tos = !!ioe->tos; 
return ret; } @@ -813,6 +815,8 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); if (ioe->has_pktinfo) ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + if (ioe->has_tos) + ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); } if (ioe->raw) diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 96d5d13bf6..630a182a27 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -39,6 +39,8 @@ static int lock_connection(struct inet_sk_desc *sk) return iptables_lock_connection(sk); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) return nftables_lock_connection(sk); + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -50,6 +52,8 @@ static int unlock_connection(struct inet_sk_desc *sk) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -483,6 +487,8 @@ static int unlock_connection_info(struct inet_sk_info *si) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 841152643a..70ca16be4a 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -878,7 +878,8 @@ static int __dump_external_socket(struct unix_sk_desc *sk, struct unix_sk_desc * if (peer->type != SOCK_DGRAM) { show_one_unix("Ext stream not supported", peer); - pr_err("Can't dump half of stream unix connection.\n"); + pr_err("Can't dump half of stream unix connection. 
name: %s; peer name: %s\n", + sk->name, peer->name); return -1; } @@ -1430,32 +1431,22 @@ static int post_open_standalone(struct file_desc *d, int fd) static int restore_file_perms(struct unix_sk_info *ui) { - if (ui->ue->file_perms) { - FilePermsEntry *perms = ui->ue->file_perms; - char fname[PATH_MAX]; + FilePermsEntry *perms = ui->ue->file_perms; + char fname[PATH_MAX]; - if (ui->ue->name.len >= sizeof(fname)) { - pr_err("The file name is too long\n"); - return -E2BIG; - } - - memcpy(fname, ui->name, ui->ue->name.len); - fname[ui->ue->name.len] = '\0'; - - if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file owner and group"); - return -errno_cpy; - } + if (!perms) + return 0; - if (fchmodat(AT_FDCWD, fname, perms->mode, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file mode bits"); - return -errno_cpy; - } + if (ui->ue->name.len >= sizeof(fname)) { + pr_err("The file name is too long\n"); + errno = -E2BIG; + return -1; } - return 0; + memcpy(fname, ui->name, ui->ue->name.len); + fname[ui->ue->name.len] = '\0'; + + return cr_fchpermat(AT_FDCWD, fname, perms->uid, perms->gid, perms->mode, 0); } static int keep_deleted(struct unix_sk_info *ui) diff --git a/criu/sockets.c b/criu/sockets.c index d17e0a9869..560c765175 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -38,7 +38,7 @@ #define SOCK_DIAG_BY_FAMILY 20 #endif -#define SK_HASH_SIZE 32 +#define SK_HASH_SIZE (1 << 14) #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER diff --git a/criu/stats.c b/criu/stats.c index 0a9b4f5d47..40b90aaf2f 100644 --- a/criu/stats.c +++ b/criu/stats.c @@ -134,6 +134,9 @@ static void display_stats(int what, StatsEntry *stats) stats->dump->pages_skipped_parent, stats->dump->pages_skipped_parent); pr_msg("Memory pages written: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_written, stats->dump->pages_written); + if (stats->dump->has_skipped_zero_pages) + pr_msg("Memory pages 
skipped because zero: %" PRIu64 " (0x%" PRIx64 ")\n", + stats->dump->skipped_zero_pages, stats->dump->skipped_zero_pages); pr_msg("Lazy memory pages: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_lazy, stats->dump->pages_lazy); } else if (what == RESTORE_STATS) { @@ -178,6 +181,10 @@ void write_stats(int what) ds_entry.has_page_pipes = true; ds_entry.page_pipe_bufs = dstats->counts[CNT_PAGE_PIPE_BUFS]; ds_entry.has_page_pipe_bufs = true; + if (opts.skip_zero_pages) { + ds_entry.has_skipped_zero_pages = true; + ds_entry.skipped_zero_pages = dstats->counts[CNT_SKIPPED_ZERO_PAGES]; + } ds_entry.shpages_scanned = dstats->counts[CNT_SHPAGES_SCANNED]; ds_entry.has_shpages_scanned = true; diff --git a/criu/tty.c b/criu/tty.c index 199984ec08..ae23094b7b 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -22,6 +22,7 @@ #include "rst-malloc.h" #include "log.h" #include "common/list.h" +#include "util.h" #include "util-pie.h" #include "proc_parse.h" #include "file-ids.h" @@ -816,8 +817,26 @@ static int do_restore_tty_parms(void *arg, int fd, pid_t pid) * on termios too. Just to be on the safe side. */ - if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) - goto err; + if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) { + struct termios t; + + if (errno != EPERM) + goto err; + + memzero(&t, sizeof(t)); + if (ioctl(fd, TIOCGLCKTRMIOS, &t) < 0) { + pr_perror("Can't get tty locked params on %#x", p->tty_id); + goto err; + } + + /* + * The ioctl(TIOCSLCKTRMIOS) requires a CRIU process to be privileged + * in the init_user_ns, but if the current "termios_locked" value equal + * to the "termios_locked" value from the image, we can safely skip setting it. 
+ */ + if (memcmp(&t, &p->tl, sizeof(struct termios)) != 0) + goto err; + } if ((p->has & HAS_TERMIOS) && ioctl(fd, TCSETS, &p->t) < 0) goto err; @@ -867,7 +886,7 @@ static int restore_tty_params(int fd, struct tty_info *info) } if (info->tie->has_uid && info->tie->has_gid) { - if (fchown(fd, info->tie->uid, info->tie->gid)) { + if (cr_fchown(fd, info->tie->uid, info->tie->gid)) { pr_perror("Can't setup uid %d gid %d on %#x", (int)info->tie->uid, (int)info->tie->gid, info->tfe->id); return -1; diff --git a/criu/tun.c b/criu/tun.c index 2e2cc32bf5..9d66f99296 100644 --- a/criu/tun.c +++ b/criu/tun.c @@ -455,27 +455,26 @@ int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **in TunLinkEntry tle = TUN_LINK_ENTRY__INIT; char spath[64]; char buf[64]; - int ret = 0; struct tun_link *tl; sprintf(spath, "class/net/%s/tun_flags", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.flags = strtol(buf, NULL, 0); sprintf(spath, "class/net/%s/owner", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.owner = strtol(buf, NULL, 10); sprintf(spath, "class/net/%s/group", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.group = strtol(buf, NULL, 10); - if (ret < 0) - return ret; - tl = get_tun_link_fd(nde->name, nde->peer_nsid, tle.flags); if (!tl) - return ret; + return -1; tle.vnethdr = tl->dmp.vnethdr; tle.sndbuf = tl->dmp.sndbuf; diff --git a/criu/util.c b/criu/util.c index db96cf938f..95ba0feda6 100644 --- a/criu/util.c +++ b/criu/util.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -662,40 +661,54 @@ int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], un return ret; } +struct child_args { + int *sk_pair; + int (*child_setup)(void); +}; + 
+static int child_func(void *_args) +{ + struct child_args *args = _args; + int sk, *sk_pair = args->sk_pair; + char c = 0; + + sk = sk_pair[1]; + close(sk_pair[0]); + + if (args->child_setup && args->child_setup() != 0) + exit(1); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + exit(1); + } + + while (1) + sleep(1000); + exit(1); +} + pid_t fork_and_ptrace_attach(int (*child_setup)(void)) { pid_t pid; int sk_pair[2], sk; char c = 0; + struct child_args cargs = { + .sk_pair = sk_pair, + .child_setup = child_setup, + }; if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } - pid = fork(); + pid = clone_noasan(child_func, CLONE_UNTRACED | SIGCHLD, &cargs); if (pid < 0) { pr_perror("fork"); return -1; } - if (pid == 0) { - sk = sk_pair[1]; - close(sk_pair[0]); - - if (child_setup && child_setup() != 0) - exit(1); - - if (write(sk, &c, 1) != 1) { - pr_perror("write"); - exit(1); - } - - while (1) - sleep(1000); - exit(1); - } - sk = sk_pair[0]; close(sk_pair[1]); @@ -952,6 +965,89 @@ FILE *fopenat(int dirfd, char *path, char *cflags) return fdopen(tmp, cflags); } +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid) +{ + struct stat st; + + if (!fchown(fd, new_uid, new_gid)) + return 0; + if (errno != EPERM) + return -1; + + if (fstat(fd, &st) < 0) { + pr_perror("fstat() after fchown() for fd %d", fd); + goto out_eperm; + } + pr_debug("fstat(%d): uid %u gid %u\n", fd, st.st_uid, st.st_gid); + + if (new_uid != st.st_uid || new_gid != st.st_gid) + goto out_eperm; + + return 0; +out_eperm: + errno = EPERM; + return -1; +} + +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags) +{ + struct stat st; + int ret; + + if (fchownat(dirfd, path, new_uid, new_gid, flags) < 0 && errno != EPERM) { + int errno_cpy = errno; + pr_perror("Unable to change [%d]/%s ownership to (%d, %d)", + dirfd, path, new_uid, new_gid); + errno = errno_cpy; + return -1; + } + + if (fstatat(dirfd, path, 
&st, flags) < 0) { + int errno_cpy = errno; + pr_perror("Unable to stat [%d]/%s", dirfd, path); + errno = errno_cpy; + return -1; + } + + if (new_uid != st.st_uid || new_gid != st.st_gid) { + errno = EPERM; + pr_perror("Unable to change [%d]/%s ownership (%d, %d) to (%d, %d)", + dirfd, path, st.st_uid, st.st_gid, new_uid, new_gid); + errno = EPERM; + return -1; + } + + if (new_mode == st.st_mode) + return 0; + + if (S_ISLNK(st.st_mode)) { + /* + * We have no lchmod() function, and fchmod() will fail on + * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() + * function and flag AT_SYMLINK_NOFOLLOW described in + * man 2 fchmodat, but it is not currently implemented. %) + */ + return 0; + } + + if (!*path && flags & AT_EMPTY_PATH) + ret = fchmod(dirfd, new_mode); + else + ret = fchmodat(dirfd, path, new_mode, flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)); + if (ret < 0) { + int errno_cpy = errno; + pr_perror("Unable to set perms %o on [%d]/%s", new_mode, dirfd, path); + errno = errno_cpy; + } + + return ret; +} + +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode) +{ + return cr_fchpermat(fd, "", new_uid, new_gid, new_mode, AT_EMPTY_PATH); +} + void split(char *str, char token, char ***out, int *n) { int i; @@ -1072,20 +1168,6 @@ const char *ns_to_string(unsigned int ns) } } -void tcp_cork(int sk, bool on) -{ - int val = on ? 1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) - pr_perror("Unable to restore TCP_CORK (%d)", val); -} - -void tcp_nodelay(int sk, bool on) -{ - int val = on ? 
1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) - pr_perror("Unable to restore TCP_NODELAY (%d)", val); -} - static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned short port) { memset(addr, 0, sizeof(*addr)); @@ -1566,7 +1648,7 @@ static int is_iptables_nft(char *bin) goto err; } - ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, 0); + ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, CRS_CAN_FAIL); if (ret) { pr_err("%s -V failed\n", cmd[0]); goto err; @@ -1880,11 +1962,16 @@ uint64_t criu_run_id; void util_init(void) { - struct timespec tp; + struct stat statbuf; + + criu_run_id = getpid(); + if (!stat("/proc/self/ns/pid", &statbuf)) + criu_run_id |= (uint64_t)statbuf.st_ino << 32; + else if (errno != ENOENT) + pr_perror("Can't stat /proc/self/ns/pid - CRIU run id might not be unique"); - clock_gettime(CLOCK_MONOTONIC, &tp); - criu_run_id = ((uint64_t)getpid() << 32) + tp.tv_sec + tp.tv_nsec; compel_run_id = criu_run_id; + pr_info("CRIU run id = %#" PRIx64 "\n", criu_run_id); } /* diff --git a/flog/Makefile b/flog/Makefile new file mode 100644 index 0000000000..12255af719 --- /dev/null +++ b/flog/Makefile @@ -0,0 +1,29 @@ +OPTS=-ggdb3 -Wall -Werror +export OPTS + +CFLAGS += -iquote include +CFLAGS += -iquote flog/include +CFLAGS += -iquote flog/include/uapi + +include $(__nmk_dir)msg.mk + +$(eval $(call gen-built-in,src)) + +flog: + $(Q) $(MAKE) $(build)=$(obj)/src all +.PHONY: flog + +clean-flog: + $(call msg-gen, $@) + $(Q) $(MAKE) $(build)=$(obj)/src clean + $(Q) $(RM) built-in.o +.PHONY: clean-flog + +clean: clean-flog +mrproper: clean + +test: + ./tests/test00 + +all-y += flog + diff --git a/flog/built-in.S b/flog/built-in.S new file mode 100644 index 0000000000..26627d0544 --- /dev/null +++ b/flog/built-in.S @@ -0,0 +1,4 @@ +SECTIONS +{ + .rodata : { _rodata_start = . ; *(.rodata*) ; _rodata_end = . 
;} +} diff --git a/flog/include/compiler.h b/flog/include/compiler.h new file mode 100644 index 0000000000..80264ec631 --- /dev/null +++ b/flog/include/compiler.h @@ -0,0 +1,77 @@ +#ifndef __COMPILER_H__ +#define __COMPILER_H__ + +/* + * Various definitions for success build, + * picked from various places, mostly from + * the linux kernel. + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +#define __section(S) __attribute__((__section__(#S))) + +#ifndef __always_inline +#define __always_inline inline __attribute__((always_inline)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef always_inline +#define always_inline __always_inline +#endif + +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + +#define __aligned(x) __attribute__((aligned(x))) + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER) +#endif + +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) + +#define min(x, y) \ + ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void)(&_min1 == &_min2); \ + _min1 < _min2 ? 
_min1 : _min2; \ + }) + +#define max(x, y) \ + ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void)(&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; \ + }) + +#define is_log2(v) (((v) & ((v)-1)) == 0) + +#endif /* __COMPILER_H__ */ diff --git a/flog/include/flog.h b/flog/include/flog.h new file mode 100644 index 0000000000..f00c20541f --- /dev/null +++ b/flog/include/flog.h @@ -0,0 +1,9 @@ +#ifndef __FLOG_H__ +#define __FLOG_H__ + +#include +#include + +#include "uapi/flog.h" + +#endif /* __FLOG_H__ */ diff --git a/flog/include/log.h b/flog/include/log.h new file mode 100644 index 0000000000..8aafe44b75 --- /dev/null +++ b/flog/include/log.h @@ -0,0 +1,17 @@ +#ifndef __LOG_H__ +#define __LOG_H__ + +#include + +#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) + +#if 1 +#define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#else +#define pr_debug(fmt, ...) +#endif + +#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_perror(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) + +#endif /* __LOG_H__ */ diff --git a/flog/include/types.h b/flog/include/types.h new file mode 100644 index 0000000000..07c992968b --- /dev/null +++ b/flog/include/types.h @@ -0,0 +1,16 @@ +#ifndef __FLOG_TYPES_H__ +#define __FLOG_TYPES_H__ + +#include +#include + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +#endif /* __FLOG_TYPES_H__ */ diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h new file mode 100644 index 0000000000..5fb71b1053 --- /dev/null +++ b/flog/include/uapi/flog.h @@ -0,0 +1,139 @@ +#ifndef __UAPI_FLOG_H__ +#define __UAPI_FLOG_H__ + +#include +#include +#include + +/* + * We work with up to 32 arguments in macros here. + * If more provided -- behaviour is undefined. 
+ */ + +/* + * By Laurent Deniau at https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s + */ +#define FLOG_PP_NARG_(...) FLOG_PP_ARG_N(__VA_ARGS__) +#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) + +#define FLOG_PP_ARG_N(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, \ + _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, N, ...) \ + N + +#define FLOG_PP_RSEQ_N() \ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, \ + 2, 1, 0 + +#define FLOG_GENMASK_0(N, x) 0 +#define FLOG_GENMASK_1(N, op, x, ...) (op(N, 0, x)) +#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_11(N, op, x, ...) ((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_15(N, op, x, ...) 
((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_17(N, op, x, ...) ((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_22(N, op, x, ...) ((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_32(N, op, x, ...) ((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) + +#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) +#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) +#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 + +#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) +#define FLOG_GENMASK(op, ...) 
FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) + +#define flog_genbit(ord, n, v, ...) \ + _Generic((v), \ + \ + /* Basic types */ \ + char: 0, \ + signed char: 0, \ + unsigned char: 0, \ + signed short int: 0, \ + unsigned short int: 0, \ + signed int: 0, \ + unsigned int: 0, \ + signed long: 0, \ + unsigned long: 0, \ + signed long long: 0, \ + unsigned long long: 0, \ + \ + /* Not used for a while */ \ + /* float: 12, */ \ + /* double: 13, */ \ + /* long double: 14, */ \ + \ + /* Basic pointers */ \ + char *: (1u << (ord - n - 1)), \ + signed char *: (1u << (ord - n - 1)), \ + unsigned char *: (1u << (ord - n - 1)), \ + signed short int *: 0, \ + unsigned short int *: 0, \ + signed int *: 0, \ + unsigned int *: 0, \ + signed long *: 0, \ + unsigned long *: 0, \ + signed long long *: 0, \ + unsigned long long *: 0, \ + void *: 0, \ + \ + /* Const basic pointers */ \ + const char *: (1u << (ord - n - 1)), \ + const signed char *: (1u << (ord - n - 1)), \ + const unsigned char *: (1u << (ord - n - 1)), \ + const signed short int *: 0, \ + const unsigned short int *: 0, \ + const signed int *: 0, \ + const unsigned int *: 0, \ + const signed long *: 0, \ + const unsigned long *: 0, \ + const signed long long *: 0, \ + const unsigned long long *: 0, \ + const void *: 0, \ + \ + /* Systypes and pointers */ \ + default: -1) + +typedef struct { + unsigned int magic; + unsigned int size; + unsigned int nargs; + unsigned int mask; + long fmt; + long args[0]; +} flog_msg_t; + +extern int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...); +void flog_decode_msg(int fdout, const char *format, ...); +extern int flog_decode_all(int fdin, int fdout); + +#define flog_encode(fdout, fmt, ...) 
\ + flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) + +int flog_map_buf(int fdout); +int flog_close(int fdout); + +#endif /* __UAPI_FLOG_H__ */ diff --git a/flog/include/util.h b/flog/include/util.h new file mode 100644 index 0000000000..7b1edb6885 --- /dev/null +++ b/flog/include/util.h @@ -0,0 +1,41 @@ +#ifndef __UTIL_H__ +#define __UTIL_H__ + +#include +#include + +#include "log.h" +#include "types.h" + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) \ + do { \ + if (p) \ + free(p); \ + } while (0) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -ENOMEM; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +#endif /* __UTIL_H__ */ diff --git a/flog/src/Makefile b/flog/src/Makefile new file mode 100644 index 0000000000..ee73ea7252 --- /dev/null +++ b/flog/src/Makefile @@ -0,0 +1,5 @@ +ccflags-y += -DCONFIG_X86_64 -iquote ./include $(OPTS) +ldflags-y += -r + +#obj-y += main.o +obj-y += flog.o diff --git a/flog/src/flog.c b/flog/src/flog.c new file mode 100644 index 0000000000..d7660f18d8 --- /dev/null +++ b/flog/src/flog.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include + +//#include + +#include "uapi/flog.h" +#include "util.h" + +#define MAGIC 0xABCDABCD + +#define BUF_SIZE (1 << 20) +static char _mbuf[BUF_SIZE]; +static char *mbuf = _mbuf; +static char *fbuf; +static uint64_t fsize; +static uint64_t mbuf_size = sizeof(_mbuf); + +/*int flog_decode_all(int fdin, int fdout) +{ + flog_msg_t *m = (void *)mbuf; + 
ffi_type *args[34] = { + [0] = &ffi_type_sint, + [1] = &ffi_type_pointer, + [2 ... 33] = &ffi_type_slong + }; + void *values[34]; + ffi_cif cif; + ffi_arg rc; + size_t i, ret; + char *fmt; + + values[0] = (void *)&fdout; + + while (1) { + ret = read(fdin, mbuf, sizeof(m)); + if (ret == 0) + break; + if (ret < 0) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + if (m->magic != MAGIC) { + fprintf(stderr, "The log file was not properly closed\n"); + break; + } + ret = m->size - sizeof(m); + if (m->size > mbuf_size) { + fprintf(stderr, "The buffer is too small"); + return -1; + } + if (read(fdin, mbuf + sizeof(m), ret) != ret) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + + fmt = mbuf + m->fmt; + values[1] = &fmt; + + for (i = 0; i < m->nargs; i++) { + values[i + 2] = (void *)&m->args[i]; + if (m->mask & (1u << i)) { + m->args[i] = (long)(mbuf + m->args[i]); + } + } + + if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, m->nargs + 2, + &ffi_type_sint, args) == FFI_OK) + ffi_call(&cif, FFI_FN(dprintf), &rc, values); + } + return 0; +}*/ + +static int flog_enqueue(flog_msg_t *m) +{ + if (write(1, m, m->size) != m->size) { + fprintf(stderr, "Unable to write a message\n"); + return -1; + } + return 0; +} + +/*extern char *rodata_start; +extern char *rodata_end; +*/ +/* Pre-allocate a buffer in a file and map it into memory. */ +int flog_map_buf(int fdout) +{ + uint64_t off = 0; + void *addr; + + /* + * Two buffers are mmapped into memory. A new one is mapped when a first + * one is completely filled. 
+ */ + if (fbuf && (mbuf - fbuf < BUF_SIZE)) + return 0; + + if (fbuf) { + if (munmap(fbuf, BUF_SIZE * 2)) { + fprintf(stderr, "Unable to unmap a buffer: %m"); + return -1; + } + off = mbuf - fbuf - BUF_SIZE; + fbuf = NULL; + } + + if (fsize == 0) + fsize += BUF_SIZE; + fsize += BUF_SIZE; + + if (ftruncate(fdout, fsize)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + + if (!fbuf) + addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, MAP_FILE | MAP_SHARED, fdout, + fsize - 2 * BUF_SIZE); + else + addr = mremap(fbuf + BUF_SIZE, BUF_SIZE, BUF_SIZE * 2, MREMAP_FIXED, fbuf); + if (addr == MAP_FAILED) { + fprintf(stderr, "Unable to map a buffer: %m"); + return -1; + } + + fbuf = addr; + mbuf = fbuf + off; + mbuf_size = 2 * BUF_SIZE; + + return 0; +} + +int flog_close(int fdout) +{ + if (mbuf == _mbuf) + return 0; + + munmap(fbuf, BUF_SIZE * 2); + + if (ftruncate(fdout, fsize - 2 * BUF_SIZE + mbuf - fbuf)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + return 0; +} + +int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...) +{ + flog_msg_t *m; + va_list argptr; + char *str_start, *p; + size_t i; + + if (mbuf != _mbuf && flog_map_buf(fdout)) + return -1; + + m = (void *)mbuf; + + m->nargs = nargs; + m->mask = mask; + + str_start = (void *)m->args + sizeof(m->args[0]) * nargs; + p = memccpy(str_start, format, 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + return -1; + } + m->fmt = str_start - mbuf; + str_start = p; + + va_start(argptr, format); + for (i = 0; i < nargs; i++) { + m->args[i] = (long)va_arg(argptr, long); + /* + * If we got a string, we should either + * reference it when in rodata, or make + * a copy (FIXME implement rodata refs). 
+ */ + if (mask & (1u << i)) { + p = memccpy(str_start, (void *)m->args[i], 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + va_end(argptr); + return -1; + } + m->args[i] = str_start - mbuf; + str_start = p; + } + } + va_end(argptr); + m->size = str_start - mbuf; + + /* + * A magic is required to know where we stop writing into a log file, + * if it was not properly closed. The file is mapped into memory, so a + * space in the file is allocated in advance and at the end it can have + * some unused tail. + */ + m->magic = MAGIC; + + m->size = roundup(m->size, 8); + if (mbuf == _mbuf) { + if (flog_enqueue(m)) + return -1; + } else { + mbuf += m->size; + mbuf_size -= m->size; + } + return 0; +} diff --git a/flog/src/main.c b/flog/src/main.c new file mode 100644 index 0000000000..e027917c68 --- /dev/null +++ b/flog/src/main.c @@ -0,0 +1,158 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "flog.h" + +extern char _rodata_start, _rodata_end; +char *rodata_start = &_rodata_start; +char *rodata_end = &_rodata_end; + +enum { + MODE_BINARY, + MODE_FPRINTF, + MODE_SPRINTF, + MODE_DPRINTF, +}; + +int main(int argc, char *argv[]) +{ + static const char str1[] = "String1 String1"; + static const char str2[] = "string2 string2 string2"; + int fdout = STDOUT_FILENO; + bool use_decoder = false; + int mode = MODE_BINARY; + size_t niter = 100; + int opt, idx; + size_t i; + + static const char short_opts[] = "m:o:di:h"; + static struct option long_opts[] = { + { "mode", required_argument, 0, 'm' }, { "output", required_argument, 0, 'o' }, + { "decode", no_argument, 0, 'd' }, { "iter", required_argument, 0, 'i' }, + { "help", no_argument, 0, 'h' }, {}, + }; + + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + + switch (opt) { + case 'm': + if (strcmp(optarg, "binary") == 0) { + mode = MODE_BINARY; + } else if 
(strcmp(optarg, "fprintf") == 0) { + mode = MODE_FPRINTF; + } else if (strcmp(optarg, "sprintf") == 0) { + mode = MODE_SPRINTF; + } else if (strcmp(optarg, "dprintf") == 0) { + mode = MODE_DPRINTF; + } else + goto usage; + break; + case 'o': + if (strcmp(optarg, "stdout") == 0) { + fdout = fileno(stdout); + } else if (strcmp(optarg, "stderr") == 0) { + fdout = fileno(stderr); + } else { + fdout = open(optarg, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fdout < 0) { + fprintf(stderr, "Can't open %s: %s\n", optarg, strerror(errno)); + exit(1); + } + } + break; + case 'i': + niter = atoi(optarg); + break; + case 'd': + use_decoder = true; + break; + case 'h': + default: + goto usage; + } + } + + switch (mode) { + case MODE_BINARY: + if (use_decoder) + return flog_decode_all(STDIN_FILENO, fdout); + + if (fdout != STDOUT_FILENO && flog_map_buf(fdout)) + return 1; + for (i = 0; i < niter; i++) + if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, + (short)2, (unsigned long)2)) + return 1; + if (flog_close(fdout)) + return 1; + break; + case MODE_DPRINTF: { + for (i = 0; i < niter; i++) { + dprintf(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + case MODE_FPRINTF: { + FILE *f = fdopen(fdout, "w"); + + for (i = 0; i < niter; i++) { + fprintf(f, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + fflush(f); + } + fclose(f); + break; + } + case MODE_SPRINTF: { + static char buf[4096]; + + for (i = 0; i < niter; i++) { + sprintf(buf, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + default: + return 1; + } + + return 0; +usage: + fprintf(stderr, "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" + "\n" + + "Examples:\n" + "\n" + + " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" + 
"\n" + " flog -m dprintf -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode without processing (queue messages only)\n" + "\n" + " flog -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after\n" + "\n" + " flog -i 100000 -d\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after, writing results into 'out' file\n" + "\n" + " flog -i 100000 -d -o out\n" + "\n"); + return 1; +} diff --git a/flog/tests/test00 b/flog/tests/test00 new file mode 100755 index 0000000000..a7937e4a18 --- /dev/null +++ b/flog/tests/test00 @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e -x + +echo Map a log file into memory +time ./flog run -i 1000000 -o /tmp/flog.raw.map +echo Write into a log file +time ./flog run -i 1000000 > /tmp/flog.raw +echo Use fprintf +time ./flog run -m fprintf -i 1000000 -o /tmp/flog.fprintf.txt +echo Use dprintf +time ./flog run -m dprintf -i 1000000 -o /tmp/flog.dprintf.txt +echo Use sprintf +time ./flog run -m sprintf -i 1000000 + +time ./flog run -d < /tmp/flog.raw > /tmp/flog.raw.txt +cmp /tmp/flog.raw.txt /tmp/flog.fprintf.txt + +time ./flog run -d < /tmp/flog.raw.map > /tmp/flog.raw.map.txt +cmp /tmp/flog.raw.map.txt /tmp/flog.fprintf.txt + +cmp /tmp/flog.dprintf.txt /tmp/flog.fprintf.txt diff --git a/images/Makefile b/images/Makefile index 004e22ec3f..ca85b1a213 100644 --- a/images/Makefile +++ b/images/Makefile @@ -2,6 +2,7 @@ proto-obj-y += stats.o proto-obj-y += core.o proto-obj-y += core-x86.o proto-obj-y += core-mips.o +proto-obj-y += core-loongarch64.o proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o diff --git a/images/core-loongarch64.proto b/images/core-loongarch64.proto new file mode 100755 index 0000000000..8258f006ea --- /dev/null +++ b/images/core-loongarch64.proto @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +message user_loongarch64_gpregs_entry { + repeated uint64 regs = 1; + required uint64 pc = 2; +} + 
+message user_loongarch64_fpregs_entry { + repeated uint64 regs = 1; + required uint64 fcc = 2; + required uint32 fcsr = 3; +} + +message thread_info_loongarch64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_loongarch64_gpregs_entry gpregs = 3[(criu).hex = true]; + required user_loongarch64_fpregs_entry fpregs = 4[(criu).hex = true]; +} diff --git a/images/core.proto b/images/core.proto index eddd1dc555..5b07b5c448 100644 --- a/images/core.proto +++ b/images/core.proto @@ -8,6 +8,7 @@ import "core-aarch64.proto"; import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; +import "core-loongarch64.proto"; import "rlimit.proto"; import "timer.proto"; @@ -63,6 +64,8 @@ message task_core_entry { optional uint64 blk_sigset_extended = 20[(criu).hex = true]; optional uint32 stop_signo = 21; + + optional uint32 membarrier_registration_mask = 22 [(criu).hex = true]; } message task_kobj_ids_entry { @@ -122,6 +125,7 @@ message core_entry { PPC64 = 4; S390 = 5; MIPS = 6; + LOONGARCH64 = 7; } required march mtype = 1; @@ -131,6 +135,7 @@ message core_entry { optional thread_info_ppc64 ti_ppc64 = 9; optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; + optional thread_info_loongarch64 ti_loongarch64 = 12; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; diff --git a/images/creds.proto b/images/creds.proto index 6228f7fcbb..220ed38587 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -24,4 +24,5 @@ message creds_entry { optional string lsm_profile = 15; optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; + optional uint32 no_new_privs = 18; } diff --git a/images/memfd.proto b/images/memfd.proto index 0e625416a7..bb0be4a6fc 100644 --- a/images/memfd.proto +++ b/images/memfd.proto @@ -22,4 +22,5 @@ message memfd_inode_entry { required uint32 seals = 6 [(criu).flags = "seals.flags"]; required uint64 inode_id = 7; optional uint32 
hugetlb_flag = 8; + optional uint32 mode = 9; }; diff --git a/images/rpc.proto b/images/rpc.proto index afd2c7b43f..b73799a8c2 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -52,6 +52,7 @@ enum criu_cg_mode { enum criu_network_lock_method { IPTABLES = 1; NFTABLES = 2; + SKIP = 3; }; enum criu_pre_dump_mode { @@ -60,7 +61,8 @@ enum criu_pre_dump_mode { }; message criu_opts { - required int32 images_dir_fd = 1; + required int32 images_dir_fd = 1 [default = -1]; + optional string images_dir = 68; /* used only if images_dir_fd == -1 */ optional int32 pid = 2; /* if not set on dump, will dump requesting process */ optional bool leave_running = 3; @@ -140,6 +142,10 @@ message criu_opts { optional bool mntns_compat_mode = 65; optional bool skip_file_rwx_check = 66; optional bool unprivileged = 67; + optional bool leave_stopped = 69; + optional bool display_stats = 70; + optional bool log_to_stderr = 71; + optional bool skip_zero_pages = 72; /* optional bool check_mounts = 128; */ } diff --git a/images/sk-inet.proto b/images/sk-inet.proto index ee1f0ae410..666326fa40 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -19,6 +19,7 @@ message ip_opts_entry { optional ip_opts_raw_entry raw = 4; optional bool pktinfo = 5; + optional uint32 tos = 6; } message inet_sk_entry { diff --git a/images/stats.proto b/images/stats.proto index 64e46181da..aafd89992d 100644 --- a/images/stats.proto +++ b/images/stats.proto @@ -22,6 +22,8 @@ message dump_stats_entry { optional uint64 shpages_scanned = 12; optional uint64 shpages_skipped_parent = 13; optional uint64 shpages_written = 14; + + optional uint64 skipped_zero_pages = 15; } message restore_stats_entry { diff --git a/include/common/arch/aarch64/asm/page.h b/include/common/arch/aarch64/asm/page.h index 90670d1265..4555debbdc 100644 --- a/include/common/arch/aarch64/asm/page.h +++ b/include/common/arch/aarch64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline 
unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/loongarch64/asm/atomic.h b/include/common/arch/loongarch64/asm/atomic.h new file mode 100644 index 0000000000..9017254397 --- /dev/null +++ b/include/common/arch/loongarch64/asm/atomic.h @@ -0,0 +1,62 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +#include +#include "common/compiler.h" + +typedef struct { + int counter; +} atomic_t; + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +static inline int __atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(v->counter), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add(1, v) +#define atomic_inc_return(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(ptr->counter) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/loongarch64/asm/bitops.h 
b/include/common/arch/loongarch64/asm/bitops.h new file mode 100644 index 0000000000..170e4f7369 --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitops.h @@ -0,0 +1,24 @@ +#ifndef _LINUX_BITOPS_H +#define _LINUX_BITOPS_H +#include "common/asm-generic/bitops.h" + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ + +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) /* index of the long that holds bit nr */ +static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long res, mask; + mask = BIT_MASK(nr); + asm volatile("amor_db.d %0, %2, %1" : "=&r"(res), "+ZB"(addr[BIT_WORD(nr)]) : "r"(mask) : "memory"); + return (res & mask) != 0; +} + +#endif diff --git a/include/common/arch/loongarch64/asm/bitsperlong.h b/include/common/arch/loongarch64/asm/bitsperlong.h new file mode 100644 index 0000000000..13d06a384e --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG _LOONGARCH_SZLONG + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/loongarch64/asm/linkage.h b/include/common/arch/loongarch64/asm/linkage.h new file mode 100644 index 0000000000..448acc29fc --- /dev/null +++ b/include/common/arch/loongarch64/asm/linkage.h @@ -0,0 +1,19 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#define __ALIGN .align 2 +#define __ALIGN_STR ".align 2" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + __ALIGN; \ + .type name, @function; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/loongarch64/asm/page.h b/include/common/arch/loongarch64/asm/page.h new file mode 100644 index 0000000000..4fcdb64dc1 --- 
/dev/null +++ b/include/common/arch/loongarch64/asm/page.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +static unsigned __page_size; +static unsigned __page_shift; + +static inline unsigned long page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SIZE page_size() +#define PAGE_SHIFT page_shift() +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#else /* CR_NOGLIBC */ + +extern unsigned long page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/include/common/arch/mips/asm/page.h b/include/common/arch/mips/asm/page.h index 25bdbc1412..4fcdb64dc1 100644 --- a/include/common/arch/mips/asm/page.h +++ b/include/common/arch/mips/asm/page.h @@ -10,7 +10,7 @@ static unsigned __page_size; static unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -31,7 +31,7 @@ static inline unsigned page_shift(void) #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/ppc64/asm/page.h b/include/common/arch/ppc64/asm/page.h index a1ff6718ad..2b0c0e5042 100644 --- a/include/common/arch/ppc64/asm/page.h +++ b/include/common/arch/ppc64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 
@@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/compiler.h b/include/common/compiler.h index bd3de01df1..1347b62362 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -30,6 +30,17 @@ #define __always_unused __attribute__((unused)) #define __must_check __attribute__((__warn_unused_result__)) +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +/* Not supported by clang */ +#if __has_attribute(__externally_visible__) +#define __visible __attribute__((__externally_visible__)) +#else +#define __visible +#endif + #define __section(S) __attribute__((__section__(#S))) #ifndef __always_inline @@ -47,7 +58,9 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __aligned #define __aligned(x) __attribute__((aligned(x))) +#endif /* * Macro to define stack alignment. diff --git a/include/common/scm.h b/include/common/scm.h index bcb198882b..5b6f78a8bd 100644 --- a/include/common/scm.h +++ b/include/common/scm.h @@ -11,7 +11,7 @@ * Because of kernel doing kmalloc for user data passed * in SCM messages, and there is kernel's SCM_MAX_FD as a limit * for descriptors passed at once we're trying to reduce - * the pressue on kernel memory manager and use predefined + * the pressure on kernel memory manager and use predefined * known to work well size of the message buffer. 
*/ #define CR_SCM_MSG_SIZE (1024) diff --git a/lib/.gitignore b/lib/.gitignore new file mode 100644 index 0000000000..a10181b800 --- /dev/null +++ b/lib/.gitignore @@ -0,0 +1 @@ +pycriu.egg-info/ diff --git a/lib/Makefile b/lib/Makefile index ff540fb75d..ae371e78e0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -4,6 +4,9 @@ UAPI_HEADERS := lib/c/criu.h images/rpc.proto images/rpc.pb-c.h criu/include/ve all-y += lib-c lib-a lib-py +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES ?= 0 + # # C language bindings. lib/c/Makefile: ; @@ -25,23 +28,23 @@ lib-a: lib/c/$(CRIU_A) # # Python bindings. -lib/py/Makefile: ; -lib/py/%: .FORCE +lib/pycriu/Makefile: ; +lib/pycriu/%: .FORCE $(call msg-gen, $@) - $(Q) $(MAKE) $(build)=lib/py $@ + $(Q) $(MAKE) $(build)=lib/pycriu $@ lib-py: - $(Q) $(MAKE) $(build)=lib/py all + $(Q) $(MAKE) $(build)=lib/pycriu all .PHONY: lib-py clean-lib: $(Q) $(MAKE) $(build)=lib/c clean - $(Q) $(MAKE) $(build)=lib/py clean + $(Q) $(MAKE) $(build)=lib/pycriu clean .PHONY: clean-lib clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc mrproper: clean -install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in +install: lib-c lib-a lib-py lib/c/criu.pc.in $(E) " INSTALL " lib $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 755 lib/c/$(CRIU_SO) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) @@ -54,9 +57,17 @@ install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig -ifeq ($(PYTHON),python3) - $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install --upgrade --force-reinstall --prefix=$(DESTDIR)$(PREFIX) 
./crit +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP INSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" +else + $(E) " INSTALL " pycriu + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib +endif +else + $(E) " INSTALL " pycriu + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib endif .PHONY: install @@ -69,8 +80,16 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) -ifeq ($(PYTHON),python3) - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP UNINSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" +else + $(E) " UNINSTALL" pycriu + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu +endif +else + $(E) " UNINSTALL" pycriu + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu endif .PHONY: uninstall diff --git a/lib/c/criu.c b/lib/c/criu.c index fc8159999c..a8402233a9 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -387,6 +387,17 @@ void criu_set_auto_dedup(bool auto_dedup) criu_local_set_auto_dedup(global_opts, auto_dedup); } +void criu_local_set_skip_zero_pages(criu_opts *opts, bool skip_zero_pages) +{ + opts->rpc->has_skip_zero_pages = true; + opts->rpc->skip_zero_pages = skip_zero_pages; +} + +void criu_set_skip_zero_pages(bool skip_zero_pages) +{ + criu_local_set_skip_zero_pages(global_opts, skip_zero_pages); +} + void criu_local_set_force_irmap(criu_opts *opts, bool force_irmap) { 
opts->rpc->has_force_irmap = true; @@ -1868,7 +1879,7 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { opts->rpc->has_network_lock = true; - if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES) { + if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } @@ -2030,3 +2041,14 @@ int criu_feature_check(struct criu_feature_check *features, size_t size) { return criu_local_feature_check(global_opts, features, size); } + +void criu_local_set_empty_ns(criu_opts *opts, int namespaces) +{ + opts->rpc->has_empty_ns = true; + opts->rpc->empty_ns = namespaces; +} + +void criu_set_empty_ns(int namespaces) +{ + criu_local_set_empty_ns(global_opts, namespaces); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index 28a083d88d..c1c6078698 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -50,6 +50,7 @@ enum criu_cg_mode { enum criu_network_lock_method { CRIU_NETWORK_LOCK_IPTABLES = 1, CRIU_NETWORK_LOCK_NFTABLES = 2, + CRIU_NETWORK_LOCK_SKIP = 3, }; enum criu_pre_dump_mode { CRIU_PRE_DUMP_SPLICE = 1, CRIU_PRE_DUMP_READ = 2 }; @@ -322,6 +323,9 @@ struct criu_feature_check { int criu_feature_check(struct criu_feature_check *features, size_t size); int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size); +void criu_local_set_empty_ns(criu_opts *opts, int namespaces); +void criu_set_empty_ns(int namespaces); + #ifdef __GNUG__ } #endif diff --git a/lib/py/.gitignore b/lib/py/.gitignore deleted file mode 100644 index d3090fca32..0000000000 --- a/lib/py/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*_pb2.py -*.pyc diff --git a/lib/pycriu/.gitignore b/lib/pycriu/.gitignore new file mode 100644 index 0000000000..111642787a --- /dev/null +++ b/lib/pycriu/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +*_pb2.py +*.pyc +version.py diff 
--git a/lib/py/Makefile b/lib/pycriu/Makefile similarity index 66% rename from lib/py/Makefile rename to lib/pycriu/Makefile index 691b6bdd33..5ce9bc8f7e 100644 --- a/lib/py/Makefile +++ b/lib/pycriu/Makefile @@ -1,4 +1,4 @@ -all-y += libpy-images rpc_pb2.py +all-y += libpy-images rpc_pb2.py version.py $(obj)/images/Makefile: ; $(obj)/images/%: .FORCE @@ -11,7 +11,10 @@ libpy-images: rpc_pb2.py: $(Q) protoc -I=images/ --python_out=$(obj) images/$(@:_pb2.py=.proto) -cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc) +version.py: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $(obj)/$@ + +cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc version.py) clean-lib-py: $(Q) $(MAKE) $(build)=$(obj)/images clean diff --git a/lib/py/__init__.py b/lib/pycriu/__init__.py similarity index 68% rename from lib/py/__init__.py rename to lib/pycriu/__init__.py index 96b3e9526c..2abcf029de 100644 --- a/lib/py/__init__.py +++ b/lib/pycriu/__init__.py @@ -1,3 +1,4 @@ from . import rpc_pb2 as rpc from . import images from .criu import * +from .version import __version__ \ No newline at end of file diff --git a/lib/py/criu.py b/lib/pycriu/criu.py similarity index 100% rename from lib/py/criu.py rename to lib/pycriu/criu.py diff --git a/lib/py/images/.gitignore b/lib/pycriu/images/.gitignore similarity index 100% rename from lib/py/images/.gitignore rename to lib/pycriu/images/.gitignore diff --git a/lib/py/images/Makefile b/lib/pycriu/images/Makefile similarity index 100% rename from lib/py/images/Makefile rename to lib/pycriu/images/Makefile diff --git a/lib/py/images/__init__.py b/lib/pycriu/images/__init__.py similarity index 100% rename from lib/py/images/__init__.py rename to lib/pycriu/images/__init__.py diff --git a/lib/py/images/images.py b/lib/pycriu/images/images.py similarity index 98% rename from lib/py/images/images.py rename to lib/pycriu/images/images.py index a1d76e7cf2..9db506e1ee 100644 --- a/lib/py/images/images.py +++ b/lib/pycriu/images/images.py @@ -42,7 +42,6 @@ 
import struct import os import array -import sys from . import magic from . import pb @@ -71,18 +70,12 @@ def __init__(self, magic): def decode_base64_data(data): """A helper function to decode base64 data.""" - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(data)) - else: - return base64.decodebytes(data) + return base64.decodebytes(str.encode(data)) def write_base64_data(f, data): """A helper function to write base64 encoded data to a file.""" - if (sys.version_info > (3, 0)): - f.write(base64.decodebytes(str.encode(data))) - else: - f.write(base64.decodebytes(data)) + f.write(base64.decodebytes(str.encode(data))) # Generic class to handle loading/dumping criu images entries from/to bin diff --git a/lib/py/images/pb2dict.py b/lib/pycriu/images/pb2dict.py similarity index 97% rename from lib/py/images/pb2dict.py rename to lib/pycriu/images/pb2dict.py index 9d581c3750..3f5f390e39 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -3,7 +3,6 @@ import os import quopri import socket -import sys from ipaddress import IPv4Address, IPv6Address, ip_address from google.protobuf.descriptor import FieldDescriptor as FD @@ -103,6 +102,7 @@ def _custom_conv(field): ('VMA_AREA_SOCKET', 1 << 11), ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), + ('VMA_AREA_MEMFD', 1 << 14), ('VMA_UNSUPP', 1 << 31), ] @@ -247,17 +247,11 @@ def encode_dev(field, value): def encode_base64(value): - if (sys.version_info > (3, 0)): - return base64.encodebytes(value).decode() - else: - return base64.encodebytes(value) + return base64.encodebytes(value).decode() def decode_base64(value): - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(value)) - else: - return base64.decodebytes(value) + return base64.decodebytes(str.encode(value)) def encode_unix(value): @@ -371,7 +365,7 @@ def pb2dict(pb, pretty=False, is_hex=False): def _dict2pb_cast(field, value): # Not considering TYPE_MESSAGE here, as repeated # and non-repeated 
messages need special treatment - # in this case, and are hadled separately. + # in this case, and are handled separately. if field.type == FD.TYPE_BYTES: return get_bytes_dec(field)(value) elif field.type == FD.TYPE_ENUM: diff --git a/lib/pyproject.toml b/lib/pyproject.toml new file mode 100644 index 0000000000..8eb4b7084d --- /dev/null +++ b/lib/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools", "protobuf<4.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pycriu" +description = "Python bindings for CRIU" +authors = [ + {name = "CRIU team", email = "criu@openvz.org"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[tool.setuptools] +packages = ["pycriu", "pycriu.images"] + +[tool.setuptools.dynamic] +version = {attr = "pycriu.__version__"} diff --git a/lib/setup.cfg b/lib/setup.cfg new file mode 100644 index 0000000000..23ee48dd5b --- /dev/null +++ b/lib/setup.cfg @@ -0,0 +1,16 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = pycriu +description = Python bindings for CRIU +author = CRIU team +author_email = criu@openvz.org +license = GPLv2 +version = attr: pycriu.__version__ + +[options] +packages = find: +python_requires = >=3.6 diff --git a/coredump/coredump-python3 b/lib/setup.py old mode 100755 new mode 100644 similarity index 55% rename from coredump/coredump-python3 rename to lib/setup.py index 3032dbadf1..618ac1de48 --- a/coredump/coredump-python3 +++ b/lib/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 +import setuptools -import coredump if __name__ == '__main__': - coredump.main() + setuptools.setup() diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 
6809ec8b9a..1078eafe6f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -263,7 +263,7 @@ ROCm | Radeon Open Compute Platform Thunk | User-mode API interface to interact with amdgpu.ko KFD | AMD Kernel Fusion Driver Mesa | Open source OpenGL implementation -GTT | Graphis Translation Table, also used to denote kernel-managed system memory for GPU access +GTT | Graphics Translation Table, also used to denote kernel-managed system memory for GPU access VRAM | Video RAM BO | Buffer Object HMM | Heterogeneous Memory Management diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 0a55e34a2b..32ff8f9364 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -107,6 +107,8 @@ extern bool kfd_vram_size_check; extern bool kfd_numa_check; extern bool kfd_capability_check; +size_t kfd_max_buffer_size; + /**************************************************************************************************/ int write_fp(FILE *fp, const void *buf, const size_t buf_len) @@ -163,6 +165,7 @@ FILE *open_img_file(char *path, bool write, size_t *size) fp = fdopen(fd, write ? "w" : "r"); if (!fp) { pr_perror("%s: Failed get pointer for %s", path, write ? "write" : "read"); + close(fd); return NULL; } @@ -449,9 +452,51 @@ void getenv_bool(const char *var, bool *value) pr_info("param: %s:%s\n", var, *value ? 
"Y" : "N"); } +void getenv_size_t(const char *var, size_t *value) +{ + char *value_str = getenv(var); + char *endp = value_str; + int sh = 0; + size_t size; + + pr_info("Value str: %s\n", value_str ? value_str : "(unset)"); + + if (value_str) { + errno = 0; /* strtoul() reports failure only via errno; clear any stale value first */ + size = (size_t)strtoul(value_str, &endp, 0); + if (errno || value_str == endp) { + pr_err("Ignoring invalid value for %s=%s, expecting a positive integer\n", var, value_str); + return; + } + switch (*endp) { + case 'k': + case 'K': + sh = 10; + break; + case 'M': + sh = 20; + break; + case 'G': + sh = 30; + break; + case '\0': /* no suffix: sh keeps its initial value of 0 */ + break; + default: + pr_err("Ignoring invalid size suffix for %s=%s, expecting 'K'/'k', 'M', or 'G'\n", var, value_str); + return; + } + if (SIZE_MAX >> sh < size) { + pr_err("Ignoring invalid value for %s=%s, exceeds SIZE_MAX\n", var, value_str); + return; + } + *value = size << sh; + } + pr_info("param: %s:0x%zx\n", var, *value); +} + int amdgpu_plugin_init(int stage) { - pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); topology_init(&dest_topology); @@ -476,12 +521,15 @@ int amdgpu_plugin_init(int stage) getenv_bool("KFD_NUMA_CHECK", &kfd_numa_check); getenv_bool("KFD_CAPABILITY_CHECK", &kfd_capability_check); } + kfd_max_buffer_size = 0; + getenv_size_t("KFD_MAX_BUFFER_SIZE", &kfd_max_buffer_size); + return 0; } void amdgpu_plugin_fini(int stage, int ret) { - pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) sys_close_drm_render_devices(&dest_topology); @@ -513,7 +561,7 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) char img_path[128]; int ret = 0; - pr_debug("amdgpu_plugin: Enter %s\n", __func__); + pr_debug("Enter %s\n", __func__); ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { pr_perror("stat error for /dev/kfd"); @@ -539,7 
+587,7 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) return 0; } - pr_perror("amdgpu_plugin: Can't handle the VMA mapping"); + pr_perror("Can't handle the VMA mapping"); return -ENOTSUP; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -607,16 +655,14 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type) +int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) { - uint64_t size, gpu_addr_src, gpu_addr_dest, gpu_addr_ib; - uint64_t gpu_addr_src_orig, gpu_addr_dest_orig; - amdgpu_va_handle h_va_src, h_va_dest, h_va_ib; - amdgpu_bo_handle h_bo_src, h_bo_dest, h_bo_ib; + uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; + uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; + amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; + amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; struct amdgpu_bo_import_result res = { 0 }; - uint64_t copy_size, bytes_remain, j = 0; - uint64_t n_packets; struct amdgpu_cs_ib_info ib_info; amdgpu_bo_list_handle h_bo_list; struct amdgpu_cs_request cs_req; @@ -625,102 +671,100 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am uint32_t expired; amdgpu_context_handle h_ctx; uint32_t *ib = NULL; - int err, shared_fd; + int j, err, shared_fd, packets_per_buffer; - shared_fd = bo_buckets[i].dmabuf_fd; - size = bo_buckets[i].size; + shared_fd = bo_bucket.dmabuf_fd; + size = bo_bucket.size; + buffer_bo_size = min(size, buffer_size); + packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; + src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? 
buffer_bo_size : size; + dst_bo_size = (type == SDMA_OP_VRAM_READ) ? buffer_bo_size : size; plugin_log_msg("Enter %s\n", __func__); /* prepare src buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_src); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, src_bo_size, &h_bo_src); if (err) { pr_perror("failed to create userptr for sdma"); return -EFAULT; } - break; - case SDMA_OP_VRAM_READ: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); return -EFAULT; } - h_bo_src = res.buf_handle; break; - default: pr_perror("Invalid sdma operation"); return -EINVAL; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_src, &h_va_src, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, src_bo_size, 0x1000, 0, &gpu_addr_src, + &h_va_src, 0); if (err) { pr_perror("failed to alloc VA for src bo"); goto err_src_va; } - err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_src, 0, src_bo_size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the src BO"); goto err_src_bo_map; } - plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, size); + plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, src_bo_size); + /* prepare dest buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - - h_bo_dest = res.buf_handle; + h_bo_dst = res.buf_handle; break; - case SDMA_OP_VRAM_READ: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_dest); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, dst_bo_size, &h_bo_dst); if (err) { pr_perror("failed to create 
userptr for sdma"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } break; - default: pr_perror("Invalid sdma operation"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_dest, &h_va_dest, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, dst_bo_size, 0x1000, 0, &gpu_addr_dst, + &h_va_dst, 0); if (err) { pr_perror("failed to alloc VA for dest bo"); - goto err_dest_va; + goto err_dst_va; } - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, dst_bo_size, gpu_addr_dst, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the dest BO"); - goto err_dest_bo_map; + goto err_dst_bo_map; } - plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dest, size); + plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dst, dst_bo_size); - n_packets = (size + max_copy_size) / max_copy_size; /* prepare ring buffer/indirect buffer for command submission * each copy packet is 7 dwords so we need to alloc 28x size for ib */ - err = alloc_and_map(h_dev, n_packets * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, + err = alloc_and_map(h_dev, packets_per_buffer * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, (void **)&ib); if (err) { pr_perror("failed to allocate and map ib/rb"); goto err_ib_gpu_alloc; } - - plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, n_packets * 28); + plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, packets_per_buffer * 28); resources[0] = h_bo_src; - resources[1] = h_bo_dest; + resources[1] = h_bo_dst; resources[2] = h_bo_ib; err = amdgpu_bo_list_create(h_dev, 3, resources, NULL, &h_bo_list); if (err) { @@ -728,103 +772,122 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am goto err_bo_list; } - memset(&cs_req, 0, sizeof(cs_req)); - memset(&fence, 0, 
sizeof(fence)); - memset(ib, 0, n_packets * 28); - - plugin_log_msg("setting up sdma packets for command submission\n"); bytes_remain = size; - gpu_addr_src_orig = gpu_addr_src; - gpu_addr_dest_orig = gpu_addr_dest; + if (type == SDMA_OP_VRAM_WRITE) + copy_dst = gpu_addr_dst; + else + copy_src = gpu_addr_src; + while (bytes_remain > 0) { - copy_size = min(bytes_remain, max_copy_size); - - ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); - ib[j++] = copy_size; - ib[j++] = 0; - ib[j++] = 0xffffffff & gpu_addr_src; - ib[j++] = (0xffffffff00000000 & gpu_addr_src) >> 32; - ib[j++] = 0xffffffff & gpu_addr_dest; - ib[j++] = (0xffffffff00000000 & gpu_addr_dest) >> 32; - - gpu_addr_src += copy_size; - gpu_addr_dest += copy_size; - bytes_remain -= copy_size; - } - - gpu_addr_src = gpu_addr_src_orig; - gpu_addr_dest = gpu_addr_dest_orig; - plugin_log_msg("pad the IB to align on 8 dw boundary\n"); - /* pad the IB to the required number of dw with SDMA_NOP */ - while (j & 7) - ib[j++] = SDMA_NOP; - - ib_info.ib_mc_address = gpu_addr_ib; - ib_info.size = j; - - cs_req.ip_type = AMDGPU_HW_IP_DMA; - /* possible future optimization: may use other rings, info available in - * amdgpu_query_hw_ip_info() - */ - cs_req.ring = 0; - cs_req.number_of_ibs = 1; - cs_req.ibs = &ib_info; - cs_req.resources = h_bo_list; - cs_req.fence_info.handle = NULL; - - plugin_log_msg("create the context\n"); - err = amdgpu_cs_ctx_create(h_dev, &h_ctx); - if (err) { - pr_perror("failed to create context for SDMA command submission"); - goto err_ctx; - } + memset(&cs_req, 0, sizeof(cs_req)); + memset(&fence, 0, sizeof(fence)); + memset(ib, 0, packets_per_buffer * 28); + + if (type == SDMA_OP_VRAM_WRITE) { + err = read_fp(storage_fp, buffer, min(bytes_remain, buffer_bo_size)); + if (err) { + pr_perror("failed to read from storage"); + goto err_bo_list; + } + } - plugin_log_msg("initiate sdma command submission\n"); - err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); - if (err) { - 
pr_perror("failed to submit command for SDMA IB"); - goto err_cs_submit_ib; - } + buffer_space_remain = buffer_bo_size; + if (type == SDMA_OP_VRAM_WRITE) + copy_src = gpu_addr_src; + else + copy_dst = gpu_addr_dst; + j = 0; + + while (bytes_remain > 0 && buffer_space_remain > 0) { + copy_size = min(min(bytes_remain, max_copy_size), buffer_space_remain); + + ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); + ib[j++] = copy_size; + ib[j++] = 0; + ib[j++] = 0xffffffff & copy_src; + ib[j++] = (0xffffffff00000000 & copy_src) >> 32; + ib[j++] = 0xffffffff & copy_dst; + ib[j++] = (0xffffffff00000000 & copy_dst) >> 32; + + copy_src += copy_size; + copy_dst += copy_size; + bytes_remain -= copy_size; + buffer_space_remain -= copy_size; + } + /* pad the IB to the required number of dw with SDMA_NOP */ + while (j & 7) + ib[j++] = SDMA_NOP; - fence.context = h_ctx; - fence.ip_type = AMDGPU_HW_IP_DMA; - fence.ip_instance = 0; - fence.ring = 0; - fence.fence = cs_req.seq_no; - err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); - if (err) { - pr_perror("failed to query fence status"); - goto err_cs_submit_ib; - } + ib_info.ib_mc_address = gpu_addr_ib; + ib_info.size = j; - if (!expired) { - pr_err("IB execution did not complete\n"); - err = -EBUSY; - goto err_cs_submit_ib; - } + cs_req.ip_type = AMDGPU_HW_IP_DMA; + /* possible future optimization: may use other rings, info available in + * amdgpu_query_hw_ip_info() + */ + cs_req.ring = 0; + cs_req.number_of_ibs = 1; + cs_req.ibs = &ib_info; + cs_req.resources = h_bo_list; + cs_req.fence_info.handle = NULL; - plugin_log_msg("done querying fence status\n"); + err = amdgpu_cs_ctx_create(h_dev, &h_ctx); + if (err) { + pr_perror("failed to create context for SDMA command submission"); + goto err_ctx; + } + err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); + if (err) { + pr_perror("failed to submit command for SDMA IB"); + goto err_cs_submit_ib; + } + + fence.context = h_ctx; + 
fence.ip_type = AMDGPU_HW_IP_DMA; + fence.ip_instance = 0; + fence.ring = 0; + fence.fence = cs_req.seq_no; + err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); + if (err) { + pr_perror("failed to query fence status"); + goto err_cs_submit_ib; + } + if (!expired) { + pr_err("IB execution did not complete\n"); + err = -EBUSY; + goto err_cs_submit_ib; + } + + if (type == SDMA_OP_VRAM_READ) { + err = write_fp(storage_fp, buffer, buffer_bo_size - buffer_space_remain); + if (err) { + pr_perror("failed to write out to storage"); + goto err_cs_submit_ib; + } + } err_cs_submit_ib: - amdgpu_cs_ctx_free(h_ctx); + amdgpu_cs_ctx_free(h_ctx); + if (err) + break; + } err_ctx: amdgpu_bo_list_destroy(h_bo_list); err_bo_list: - free_and_unmap(n_packets * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); + free_and_unmap(packets_per_buffer * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); err_ib_gpu_alloc: - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_UNMAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, size, gpu_addr_dst, 0, AMDGPU_VA_OP_UNMAP); if (err) - pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dest, size); -err_dest_bo_map: - err = amdgpu_va_range_free(h_va_dest); + pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dst, size); +err_dst_bo_map: + err = amdgpu_va_range_free(h_va_dst); if (err) pr_perror("dest range free failed"); -err_dest_va: - err = amdgpu_bo_free(h_bo_dest); +err_dst_va: + err = amdgpu_bo_free(h_bo_dst); if (err) pr_perror("dest bo free failed"); - -err_dest_bo_prep: +err_dst_bo_prep: err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_UNMAP); if (err) pr_perror("failed to GPU unmap the src BO %lx, size = %lx", gpu_addr_src, size); @@ -836,7 +899,6 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am err = amdgpu_bo_free(h_bo_src); if (err) pr_perror("src bo free failed"); - plugin_log_msg("Leaving sdma_copy_bo, err = %d\n", err); 
return err; } @@ -845,19 +907,18 @@ void *dump_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - BoEntry **bo_info = thread_data->bo_entries; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; - size_t max_bo_size = 0, image_size = 0; + size_t max_bo_size = 0, image_size = 0, buffer_size; uint64_t max_copy_size; uint32_t major, minor; int num_bos = 0; int i, ret = 0; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -884,10 +945,11 @@ void *dump_bo_contents(void *_thread_data) } } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. 
Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } @@ -910,19 +972,16 @@ void *dump_bo_contents(void *_thread_data) num_bos++; /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_READ); + ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_READ); if (ret) { pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); break; } - plugin_log_msg("** Successfully drained the BO using sDMA: bo_buckets[%d] **\n", i); - ret = write_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - break; } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -939,19 +998,18 @@ void *restore_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - size_t image_size = 0, total_bo_size = 0, max_bo_size = 0; - BoEntry **bo_info = thread_data->bo_entries; + size_t image_size = 0, total_bo_size = 0, max_bo_size = 0, buffer_size; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; uint64_t max_copy_size; uint32_t major, minor; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; int num_bos = 0; int i, ret = 0; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -977,7 +1035,6 @@ void *restore_bo_contents(void *_thread_data) goto exit; } - /* Allocate buffer to fit biggest BO */ for (i = 0; i < thread_data->num_of_bos; i++) { if (bo_buckets[i].gpu_id == thread_data->gpu_id && (bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM 
| KFD_IOC_ALLOC_MEM_FLAGS_GTT))) { @@ -989,17 +1046,17 @@ void *restore_bo_contents(void *_thread_data) } if (total_bo_size != image_size) { - pr_err("amdgpu_plugin: %s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, - total_bo_size); + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, total_bo_size); ret = -EINVAL; goto exit; } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } @@ -1013,11 +1070,8 @@ void *restore_bo_contents(void *_thread_data) num_bos++; - ret = read_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - goto exit; - - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE); if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -1026,7 +1080,7 @@ void *restore_bo_contents(void *_thread_data) } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -1054,9 +1108,9 @@ int check_hsakmt_shared_mem(uint64_t *shared_mem_size, uint32_t *shared_mem_magi /* First 4 bytes of shared file is the magic */ ret = read_file(HSAKMT_SHM_PATH, shared_mem_magic, sizeof(*shared_mem_magic)); if (ret) - pr_perror("amdgpu_plugin: Failed to read shared mem magic"); + pr_perror("Failed to read shared mem magic"); else - plugin_log_msg("amdgpu_plugin: Shared mem magic:0x%x\n", 
*shared_mem_magic); + plugin_log_msg("Shared mem magic:0x%x\n", *shared_mem_magic); return 0; } @@ -1071,7 +1125,7 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; if (!stat(HSAKMT_SHM_PATH, &st)) { - pr_debug("amdgpu_plugin: %s already exists\n", HSAKMT_SHM_PATH); + pr_debug("%s already exists\n", HSAKMT_SHM_PATH); } else { pr_info("Warning:%s was missing. Re-creating new file but we may lose perf counters\n", HSAKMT_SHM_PATH); @@ -1079,14 +1133,14 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha ret = ftruncate(fd, shared_mem_size); if (ret < 0) { - pr_err("amdgpu_plugin: Failed to truncate shared mem %s\n", HSAKMT_SHM); + pr_err("Failed to truncate shared mem %s\n", HSAKMT_SHM); close(fd); return -errno; } ret = write(fd, &shared_mem_magic, sizeof(shared_mem_magic)); if (ret != sizeof(shared_mem_magic)) { - pr_perror("amdgpu_plugin: Failed to restore shared mem magic"); + pr_perror("Failed to restore shared mem magic"); close(fd); return -errno; } @@ -1112,7 +1166,7 @@ static int unpause_process(int fd) ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to unpause process"); + pr_perror("Failed to unpause process"); goto exit; } @@ -1254,7 +1308,7 @@ bool kernel_supports_criu(int fd) } if (kmtIoctl(fd, AMDKFD_IOC_GET_VERSION, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call get version ioctl"); + pr_perror("Failed to call get version ioctl"); ret = false; goto exit; } @@ -1262,8 +1316,8 @@ bool kernel_supports_criu(int fd) pr_debug("Kernel IOCTL version:%d.%02d\n", args.major_version, args.minor_version); if (args.major_version != KFD_IOCTL_MAJOR_VERSION || args.minor_version < MIN_KFD_IOCTL_MINOR_VERSION) { - pr_err("amdgpu_plugin: CR not supported on current kernel (current:%02d.%02d min:%02d.%02d)\n", - args.major_version, args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); + pr_err("CR not supported on 
current kernel (current:%02d.%02d min:%02d.%02d)\n", args.major_version, + args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); ret = false; goto exit; } @@ -1286,13 +1340,13 @@ int amdgpu_plugin_dump_file(int fd, int id) size_t len; if (fstat(fd, &st) == -1) { - pr_perror("amdgpu_plugin: fstat error"); + pr_perror("fstat error"); return -1; } ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { - pr_perror("amdgpu_plugin: fstat error for /dev/kfd"); + pr_perror("fstat error for /dev/kfd"); return -1; } @@ -1317,12 +1371,11 @@ int amdgpu_plugin_dump_file(int fd, int id) CriuRenderNode rd = CRIU_RENDER_NODE__INIT; struct tp_node *tp_node; - pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), - fd, id); + pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id); tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev)); if (!tp_node) { - pr_err("amdgpu_plugin: Failed to find a device with minor number = %d\n", minor(st.st_rdev)); + pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev)); return -ENODEV; } @@ -1350,7 +1403,7 @@ int amdgpu_plugin_dump_file(int fd, int id) return ret; } - pr_info("amdgpu_plugin: %s : %s() called for fd = %d\n", CR_PLUGIN_DESC.name, __func__, major(st.st_rdev)); + pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); /* KFD only allows ioctl calls from the same process that opened the KFD file descriptor. 
* The existing /dev/kfd file descriptor that is passed in is only allowed to do IOCTL calls with @@ -1362,13 +1415,13 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_PROCESS_INFO; if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call process info ioctl"); + pr_perror("Failed to call process info ioctl"); ret = -1; goto exit; } - pr_info("amdgpu_plugin: devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, - args.num_objects, args.priv_data_size); + pr_info("devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, args.num_objects, + args.priv_data_size); e = xmalloc(sizeof(*e)); if (!e) { @@ -1401,7 +1454,7 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_CHECKPOINT; ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to call dumper (process) ioctl"); + pr_perror("Failed to call dumper (process) ioctl"); goto exit; } @@ -1423,11 +1476,11 @@ int amdgpu_plugin_dump_file(int fd, int id) goto exit; snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); - pr_info("amdgpu_plugin: img_path = %s\n", img_path); + pr_info("img_path = %s\n", img_path); len = criu_kfd__get_packed_size(e); - pr_info("amdgpu_plugin: Len = %ld\n", len); + pr_info("Len = %ld\n", len); buf = xmalloc(len); if (!buf) { @@ -1453,9 +1506,9 @@ int amdgpu_plugin_dump_file(int fd, int id) free_e(e); if (ret) - pr_err("amdgpu_plugin: Failed to dump (ret:%d)\n", ret); + pr_err("Failed to dump (ret:%d)\n", ret); else - pr_info("amdgpu_plugin: Dump successful\n"); + pr_info("Dump successful\n"); return ret; } @@ -1501,10 +1554,10 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) device_bucket->drm_fd = node_get_drm_render_device(tp_node); if (device_bucket->drm_fd < 0) { - pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver"); + pr_perror("Can't pass NULL drm render fd to driver"); goto exit; } else 
{ - pr_info("amdgpu_plugin: passing drm render fd = %d to driver\n", device_bucket->drm_fd); + pr_info("passing drm render fd = %d to driver\n", device_bucket->drm_fd); } } @@ -1588,7 +1641,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf vma_md->new_pgoff = bo_bucket->restored_offset; vma_md->fd = node_get_drm_render_device(tp_node); - plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx " + plugin_log_msg("adding vma_entry:addr:0x%lx old-off:0x%lx " "new_off:0x%lx new_minor:%d\n", vma_md->vma_entry, vma_md->old_pgoff, vma_md->new_pgoff, vma_md->new_minor); @@ -1669,7 +1722,7 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; - pr_info("amdgpu_plugin: Initialized kfd plugin restorer with ID = %d\n", id); + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1713,7 +1766,7 @@ int amdgpu_plugin_restore_file(int id) } fclose(img_fp); - pr_info("amdgpu_plugin: render node gpu_id = 0x%04x\n", rd->gpu_id); + pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id); target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id); if (!target_gpu_id) { @@ -1727,11 +1780,11 @@ int amdgpu_plugin_restore_file(int id) goto fail; } - pr_info("amdgpu_plugin: render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); + pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); fd = node_get_drm_render_device(tp_node); if (fd < 0) - pr_err("amdgpu_plugin: Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); fail: criu_render_node__free_unpacked(rd, NULL); xfree(buf); @@ -1743,7 +1796,12 @@ int amdgpu_plugin_restore_file(int id) * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in * tp_node. 
*/ - return dup(fd); + fd = dup(fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + return fd; } fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1752,7 +1810,7 @@ int amdgpu_plugin_restore_file(int id) return -1; } - pr_info("amdgpu_plugin: Opened kfd, fd = %d\n", fd); + pr_info("Opened kfd, fd = %d\n", fd); if (!kernel_supports_criu(fd)) return -ENOTSUP; @@ -1780,7 +1838,7 @@ int amdgpu_plugin_restore_file(int id) return -1; } - plugin_log_msg("amdgpu_plugin: read image file data\n"); + plugin_log_msg("read image file data\n"); /* * Initialize fd_next to be 1 greater than the biggest file descriptor in use by the target restore process. @@ -1847,10 +1905,10 @@ int amdgpu_plugin_restore_file(int id) xfree(buf); if (ret) { - pr_err("amdgpu_plugin: Failed to restore (ret:%d)\n", ret); + pr_err("Failed to restore (ret:%d)\n", ret); fd = ret; } else { - pr_info("amdgpu_plugin: Restore successful (fd:%d)\n", fd); + pr_info("Restore successful (fd:%d)\n", fd); } return fd; @@ -1870,7 +1928,7 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; - plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__); + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1903,13 +1961,18 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) { *new_offset = vma_md->new_pgoff; - if (is_renderD) - *updated_fd = vma_md->fd; - else - *updated_fd = -1; + *updated_fd = -1; + if (is_renderD) { + int fd = dup(vma_md->fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + *updated_fd = fd; + } - plugin_log_msg("amdgpu_plugin: old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, - vma_md->new_pgoff, *updated_fd); + plugin_log_msg("old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, 
vma_md->new_pgoff, + *updated_fd); return 1; } @@ -1924,7 +1987,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) struct kfd_ioctl_criu_args args = { 0 }; int fd, ret = 0; - pr_info("amdgpu_plugin: Inside %s for target pid = %d\n", __func__, target_pid); + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); if (fd < 0) { @@ -1934,7 +1997,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) args.pid = target_pid; args.op = KFD_CRIU_OP_RESUME; - pr_info("amdgpu_plugin: Calling IOCTL to start notifiers and queues\n"); + pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { pr_perror("restore late ioctl failed"); ret = -1; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 42689933ee..ef79e5ef42 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -1063,7 +1063,7 @@ static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest) * * Nodes compatibility are determined by: * 1. Comparing the node properties - * 2. Making sure iolink mappings to CPUs would be compabitle with existing iolink mappings in maps + * 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps * * If src_node and dest_node are mappable, then map_device will push the new mapping * for src_node -> dest_node into new_maps. @@ -1241,7 +1241,7 @@ static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, s return true; } else { /* We could not map remaining nodes in the list. Add dest node back - * to list and try to map next dest ndoe in list to current src + * to list and try to map next dest node in list to current src * node. 
*/ pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, " diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index b88fe20cfe..e1ebb75a3a 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -23,7 +23,7 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include <drm/drm.h> +#include <libdrm/drm.h> #include <linux/ioctl.h> /* diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index af1858ab58..2c58c910e7 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -23,6 +23,7 @@ RUN apk update && apk add \ python3 \ sudo \ libcap-utils \ + libdrm-dev \ util-linux COPY . /criu @@ -32,6 +33,7 @@ RUN make mrproper && date && make -j $(nproc) CC="$CC" && date RUN apk add \ ip6tables \ iptables \ + iptables-legacy \ nftables \ iproute2 \ tar \ @@ -46,6 +48,6 @@ RUN apk add \ # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml +RUN pip3 install junit_xml --break-system-packages RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm index c0d181b039..c466a73d2d 100644 --- a/scripts/build/Dockerfile.amd-rocm +++ b/scripts/build/Dockerfile.amd-rocm @@ -55,7 +55,6 @@ RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-insta protobuf-compiler \ python-protobuf \ python3-minimal \ - python3-future \ python-ipaddress \ curl \ wget \ diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index f2bce1e5ba..b9968e876b 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -35,6 +35,7 @@ RUN pacman -Syu --noconfirm \ asciidoctor \ python-junit-xml \ python-importlib-metadata \ + libdrm \ diffutils COPY .
/criu diff --git a/scripts/build/Dockerfile.centos7 b/scripts/build/Dockerfile.centos7 deleted file mode 100644 index 21e70ff0eb..0000000000 --- a/scripts/build/Dockerfile.centos7 +++ /dev/null @@ -1,45 +0,0 @@ -FROM centos:7 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm -RUN yum install -y \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - protobuf-python \ - python \ - python-flake8 \ - python-ipaddress \ - python2-future \ - python2-junit_xml \ - python-yaml \ - python-six \ - sudo \ - tar \ - which \ - e2fsprogs \ - python2-pip \ - rubygem-asciidoctor - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index 488f95d650..b065246744 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -28,7 +28,6 @@ RUN yum install -y --allowerasing \ python3-devel \ python3-flake8 \ python3-PyYAML \ - python3-future \ python3-protobuf \ python3-pip \ sudo \ diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 350102818b..0318f650f3 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -6,7 +6,6 @@ COPY scripts/ci/apt-install /bin/apt-install RUN apt-install protobuf-c-compiler \ libprotobuf-c-dev \ libaio-dev \ - python3-future \ libprotobuf-dev \ protobuf-compiler \ libcap-dev \ @@ -31,4 +30,3 @@ WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git a/scripts/build/Dockerfile.linux32.tmpl 
b/scripts/build/Dockerfile.linux32.tmpl index a15038631c..13e9926424 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -21,8 +21,7 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ - python3-minimal \ - python3-future + python3-minimal COPY . /criu WORKDIR /criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 23db14e8df..c2cf20a36b 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -6,7 +6,6 @@ COPY scripts/ci/apt-install /bin/apt-install RUN apt-install protobuf-c-compiler \ libprotobuf-c-dev \ libaio-dev \ - python3-future \ libprotobuf-dev \ protobuf-compiler \ libcap-dev \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index e0e72372d9..9b53a76aab 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -27,11 +27,9 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ - python-is-python3 \ python3-minimal \ python3-protobuf \ - python3-yaml \ - python3-future + python3-yaml COPY . /criu WORKDIR /criu diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 30dd9ebeb8..1caa1e4235 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -20,14 +20,6 @@ export CONTAINER_RUNTIME alpine: ZDTM_OPTS=-x zdtm/static/binfmt_misc -x zdtm/static/sched_policy00 -define DOCKER_JSON -{ - "storage-driver": "devicemapper" -} -endef - -export DOCKER_JSON - ifeq ($(GITHUB_ACTIONS),true) # GitHub Actions does not give us a real TTY and errors out with # 'the input device is not a TTY' if using '-t' @@ -47,34 +39,20 @@ else endif ifeq ($(CONTAINER_RUNTIME),podman) - # Just as Docker needs to use devicemapper Podman needs vfs - # as graphdriver as overlayfs does not support all test cases - STORAGE_DRIVER := vfs # Podman limits the number of processes in a container using cgroups. 
# Disable it as it breaks the thread-bomb test CONTAINER_OPTS += --pids-limit=0 endif -export STORAGE_DRIVER - -restart-docker: - if [ "$$UNAME" = "x86_64" ] && [ "$$CONTAINER_RUNTIME" = "docker" ]; then \ - echo "$$DOCKER_JSON" > /etc/docker/daemon.json; \ - cat /etc/docker/daemon.json; \ - systemctl status docker; \ - systemctl restart docker; \ - systemctl status docker; \ - fi - export ZDTM_OPTS -$(TARGETS): restart-docker +$(TARGETS): $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run --env-file docker.env $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh + $(CONTAINER_RUNTIME) run --env-file docker.env -v `pwd`/../../:/criu $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh -fedora-asan: restart-docker +fedora-asan: $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) + $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) -v `pwd`/../../:/criu criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) docker-test: ./docker-test.sh @@ -82,10 +60,7 @@ docker-test: podman-test: ./podman-test.sh -# overlayfs behaves differently on Ubuntu and breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 -# Switch to devicemapper -java-test: restart-docker +java-test: ./java-test.sh setup-vagrant: @@ -102,5 +77,18 @@ vagrant-fedora-non-root: setup-vagrant .PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root +check-commit: + ($(MAKE) -j $$(nproc) -C ../.. 
&& \ + echo "Commit $$(git rev-parse --short HEAD) built successfully") || \ + (echo "Build failed for $$(git rev-list -n 1 --pretty HEAD)" && \ + exit 1) + +.PHONY: check-commit + +loongarch64-qemu-test: + ./loongarch64-qemu-test.sh + +.PHONY: loongarch64-qemu-test + %: $(MAKE) -C ../build $@$(target-suffix) diff --git a/scripts/ci/apt-install b/scripts/ci/apt-install index 45aca13f40..676e0f7949 100755 --- a/scripts/ci/apt-install +++ b/scripts/ci/apt-install @@ -15,7 +15,7 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends "$@" && break + apt-get update -y && apt-get install -y --no-install-recommends "$@" && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index deeeca0b9d..8b72fa5f1a 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -4,6 +4,9 @@ set -x cat /proc/self/mountinfo +time make ASAN=1 -j 4 V=1 +time make -j4 -C test/zdtm V=1 + chmod 0777 test chmod 0777 test/zdtm/transition/ chmod 0777 test/zdtm/static diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index beb7da6da6..7e7ef71973 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -15,10 +15,11 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce - -# shellcheck source=/dev/null -. /etc/lsb-release +# checkpoint/restore is broken in Docker Engine (Community) version 25.0.0-beta.1 +# https://github.com/moby/moby/discussions/46816 +# Downgrade to the latest stable version. 
+VERSION_STRING=5:24.0.7-1~ubuntu.20.04~focal +./apt-install docker-ce=$VERSION_STRING docker-ce-cli=$VERSION_STRING containerd.io docker-buildx-plugin docker-compose-plugin # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json @@ -28,6 +29,9 @@ CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf + export SKIP_CI_TEST=1 ./run-ci-tests.sh @@ -84,27 +88,25 @@ print_logs () { } declare -i max_restore_container_tries=3 -current_iteration= restore_container () { CHECKPOINT_NAME=$1 - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { + for i in $(seq $max_restore_container_tries); do + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log && break + # FIXME: There is a race condition in docker/containerd that causes # docker to occasionally fail when starting a container from a # checkpoint immediately after the checkpoint has been created. 
# https://github.com/moby/moby/issues/42900 - if [ "$current_iteration" -gt "$max_restore_container_tries" ]; then + if grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log; then + echo "Retry container restore: $i/$max_restore_container_tries" + sleep 1; + else print_logs fi - grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log && { - ((current_iteration+=1)) - echo "Retry container restore: $current_iteration" - sleep 1; - restore_container "$CHECKPOINT_NAME" - } || - print_logs - } && current_iteration=0 + + done } # Scenario: Create multiple containers and checkpoint and restore them once diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh new file mode 100755 index 0000000000..d5646468e8 --- /dev/null +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -x + +./apt-install \ + apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common \ + sshpass \ + openssh-client + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable test" + +./apt-install docker-ce + +# shellcheck source=/dev/null +. /etc/lsb-release + +# docker checkpoint and restore is an experimental feature +echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart + +docker info + +# run a loongarch64 vm + +PORT='2222' +USER='root' +PASSWORD='loongarch64' +NAME='vm' + +docker run \ + -d \ + --net host \ + --name $NAME \ + merore/archlinux-loongarch64 + +run() { + if [ -z "$1" ]; then + echo "Command cannot be empty." + exit 1 + fi + sshpass -p $PASSWORD ssh -o StrictHostKeyChecking=no -p $PORT $USER@127.0.0.1 "$1" +} + +# wait vm to start +while (! 
run "uname -a") +do + echo "Wait vm to start..." + sleep 1 +done +echo "The loongarch64 vm is started!" + +# Tar criu and send to vm +tar -cf criu.tar ../../../criu +sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -P $PORT criu.tar $USER@127.0.0.1:/root + +# build and test +run 'cd /root; tar -xf criu.tar' +run 'cd /root/criu; make -j4 && make -j4 -C test/zdtm' +run "cd /root/criu; ./test/zdtm.py run -t zdtm/static/maps02 -t zdtm/static/maps05 -t zdtm/static/maps06 -t zdtm/static/maps10 -t zdtm/static/maps_file_prot -t zdtm/static/memfd00 -t zdtm/transition/fork -t zdtm/transition/fork2 -t zdtm/transition/shmem -f h" diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 687acb8ff5..72ad59a501 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -17,6 +17,9 @@ mkdir -p /etc/criu echo "manage-cgroups ignore" > /etc/criu/runc.conf sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers.conf +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf + podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 7c62aaaa2c..d812c5faa5 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -18,13 +18,13 @@ dnf install -y \ libnet-devel \ libnl3-devel \ libbsd-devel \ + libselinux-utils \ make \ procps-ng \ protobuf-c-devel \ protobuf-devel \ python3-flake8 \ python3-PyYAML \ - python3-future \ python3-protobuf \ python3-junit_xml \ python3-pip \ @@ -36,6 +36,7 @@ dnf install -y \ which \ e2fsprogs \ rubygem-asciidoctor \ + libdrm-devel \ kmod # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 229de97c1c..ef7e869e03 100755 --- 
a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -5,8 +5,8 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml - libperl-dev pkg-config python3-future python3-protobuf - python3-pip python3-importlib-metadata python3-junit.xml) + libperl-dev pkg-config python3-protobuf python3-pip + python3-importlib-metadata python3-junit.xml libdrm-dev) X86_64_PKGS=(gcc-multilib) @@ -58,10 +58,6 @@ ci_prep () { scripts/ci/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" - - # zdtm uses an unversioned python binary to run the tests. - # let's point python to python3 - ln -sf /usr/bin/python3 /usr/bin/python } test_stream() { @@ -260,6 +256,7 @@ if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi +make -C test/others/criu-ns/ run make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run @@ -291,14 +288,27 @@ ip net add test # Rootless tests # Check if cap_checkpoint_restore is supported and also if unshare -c is supported. -if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then +# +# Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312). +# This is a temporary workaround until fixed in the kernel. +# The kernel currently does not show correct device and inode numbers in /proc/pid/maps +# for stackable file systems. +if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true && [ ! 
-e /run/.containerenv ]; then make -C test/zdtm/ cleanout rm -rf test/dump setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then + # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. + selinuxmode=$(getenforce) + setenforce Permissive + fi # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then + setenforce "$selinuxmode" + fi setcap -r criu/criu else echo "Skipping unprivileged mode tests" @@ -319,6 +329,9 @@ make -C test/others/ns_ext run # config file parser and parameter testing make -C test/others/config-file run +# action script testing +make -C test/others/action-script run + # Skip all further tests when running with GCOV=1 # The one test which currently cannot handle GCOV testing is compel/test # Probably because the GCOV Makefile infrastructure does not exist in compel @@ -326,3 +339,8 @@ make -C test/others/config-file run # compel testing make -C compel/test + +# amdgpu_plugin testing +make amdgpu_plugin +make -C plugins/amdgpu/ test_topology_remap +./plugins/amdgpu/test_topology_remap diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 5cc8424423..c8cf0be744 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,9 +6,9 @@ set -e set -x 
-VAGRANT_VERSION=2.2.19 -FEDORA_VERSION=37 -FEDORA_BOX_VERSION=37.20221105.0 +VAGRANT_VERSION=2.3.7 +FEDORA_VERSION=38 +FEDORA_BOX_VERSION=38.20230413.1 setup() { if [ -n "$TRAVIS" ]; then @@ -19,7 +19,7 @@ setup() { # Tar up the git checkout to have vagrant rsync it to the VM tar cf criu.tar ../../../criu # Cirrus has problems with the following certificate. - wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_"$(uname -m)".deb -O /tmp/vagrant.deb && \ + wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ @@ -38,7 +38,7 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-flake8 python3-future python3-protobuf python3-importlib-metadata \ + protobuf-devel python3-flake8 python3-protobuf python3-importlib-metadata \ python3-junit_xml rubygem-asciidoctor iptables libselinux-devel libbpf-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd @@ -57,6 +57,11 @@ fedora-no-vdso() { } fedora-rawhide() { + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks + # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously + # installed this reboots the VM. 
+ vagrant reload + ssh default uname -a # # Workaround the problem: # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected @@ -65,6 +70,10 @@ fedora-rawhide() { # ssh default 'sudo dnf remove -y crun || true' ssh default sudo dnf install -y podman runc + # Some tests in the container need selinux to be disabled. + # In the container it is not possible to change the state of selinux. + # Let's just disable it for this test run completely. + ssh default 'sudo setenforce Permissive' ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } diff --git a/scripts/criu-ns b/scripts/criu-ns index d51e7772c0..4c032aa140 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -6,6 +6,7 @@ import sys import os import fcntl import termios +import time # constants for unshare CLONE_NEWNS = 0x00020000 @@ -70,7 +71,19 @@ def _wait_for_process_status(criu_pid): try: (pid, status) = os.wait() if pid == criu_pid: - return os.waitstatus_to_exitcode(status) + # The following code block is based on + # os.waitstatus_to_exitcode() introduced in Python 3.9 + # and we implement this for comparability with older + # versions of Python. 
+ if os.WIFSIGNALED(status): + return os.WTERMSIG(status) + elif os.WIFEXITED(status): + return os.WEXITSTATUS(status) + elif os.WIFSTOPPED(status): + return os.WSTOPSIG(status) + else: + raise Exception("CRIU was terminated by an " + "unidentified reason") except OSError: return -251 @@ -80,8 +93,21 @@ def run_criu(args): Spawn CRIU binary """ print(sys.argv) - os.execlp('criu', *['criu'] + args) - raise OSError(errno.ENOENT, "No such command") + + if "--criu-binary" in args: + try: + opt_index = args.index("--criu-binary") + path = args[opt_index + 1] + del args[opt_index:opt_index + 2] + args.insert(0, "criu") + os.execv(path, args) + raise OSError(errno.ENOENT, "No such command") + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--criu-binary missing argument") + else: + args.insert(0, "criu") + os.execvp("criu", args) + raise OSError(errno.ENOENT, "No such command") # pidns_holder creates a process that is reparented to the init. @@ -110,8 +136,8 @@ def wrap_restore(): if '--restore-sibling' in restore_args: raise OSError(errno.EINVAL, "--restore-sibling is not supported") - # Unshare pid and mount namespaces - if _unshare(CLONE_NEWNS | CLONE_NEWPID) != 0: + # Unshare pid namespace + if _unshare(CLONE_NEWPID) != 0: _errno = ctypes.get_errno() raise OSError(_errno, errno.errorcode[_errno]) @@ -123,8 +149,32 @@ def wrap_restore(): restore_detached = True restore_args.remove('--restore-detached') + restore_pidfile = None + if '--pidfile' in restore_args: + try: + opt_index = restore_args.index('--pidfile') + restore_pidfile = restore_args[opt_index + 1] + del restore_args[opt_index:opt_index + 2] + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--pidfile missing argument") + + if not restore_pidfile.startswith('/'): + for base_dir_opt in ['--work-dir', '-W', '--images-dir', '-D']: + if base_dir_opt in restore_args: + try: + opt_index = restore_args.index(base_dir_opt) + restore_pidfile = 
os.path.join(restore_args[opt_index + 1], restore_pidfile) + break + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, base_dir_opt + " missing argument") + criu_pid = os.fork() if criu_pid == 0: + # Unshare mount namespace + if _unshare(CLONE_NEWNS) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + os.setsid() # Set stdin tty to be a controlling tty of our new session, this is # required by --shell-job option, as for it CRIU would try to set a @@ -139,6 +189,25 @@ def wrap_restore(): _mount_new_proc() run_criu(restore_args) + if restore_pidfile: + restored_pid = None + retry = 5 + + while not restored_pid and retry: + with open('/proc/%d/task/%d/children' % (criu_pid, criu_pid)) as f: + line = f.readline().strip() + if len(line): + restored_pid = line + break + retry -= 1 + time.sleep(1) + + if restored_pid: + with open(restore_pidfile, 'w+') as f: + f.write(restored_pid) + else: + print("Warn: Search of restored pid for --pidfile option timeouted") + if restore_detached: return 0 diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index b80175f05b..5b6037d619 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -8,7 +8,7 @@ URL="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/.c curl -s "${URL}" | sed -e " s,^\( *\)#\([A-Z]\),\1\2,g; s,ControlStatements,ControlStatementsExceptForEachMacros,g; - s,ColumnLimit: 80,ColumnLimit: 120,g; + s,ColumnLimit: 80,ColumnLimit: 0,g; s,Intended for clang-format >= 4,Intended for clang-format >= 11,g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_bit',g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; diff --git a/scripts/github-indent-warnings.py b/scripts/github-indent-warnings.py new file mode 100755 index 0000000000..04f82d6c11 --- /dev/null +++ b/scripts/github-indent-warnings.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +import sys +import re + +re_file = r'^diff --git 
a/(\S\S*)\s.*$' +re_line = r'^@@ -(\d\d*)\D.*@@.*$' + +if __name__ == '__main__': + if len(sys.argv) != 1 and len(sys.argv) != 2: + print(f'usage: {sys.argv[0]} ') + print(f'usage: | {sys.argv[0]}') + exit(1) + + input_file = sys.stdin.fileno() + if len(sys.argv) == 2: + input_file = sys.argv[1] + + with open(input_file, 'r') as fi: + file_name = None + line_number = None + for line in fi: + file_matches = re.findall(re_file, line) + if len(file_matches) == 1: + file_name = file_matches[0] + continue + + if file_name is None: + continue + + line_matches = re.findall(re_line, line) + if len(line_matches) == 1: + line_number = int(line_matches[0]) + 3 + print(f'::warning file={file_name},line={line_number}::clang-format: Possible coding style problem (https://github.com/checkpoint-restore/criu/blob/criu-dev/CONTRIBUTING.md#automatic-tools-to-fix-coding-style)') diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh index 540c2c0949..8be49c7871 100755 --- a/scripts/install-debian-pkgs.sh +++ b/scripts/install-debian-pkgs.sh @@ -15,7 +15,7 @@ function print_help() function process() { sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' ${REQ_PKGS} )" + sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" } if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then diff --git a/scripts/magic-gen.py b/scripts/magic-gen.py index 3b1f29fb52..38dff1424a 100755 --- a/scripts/magic-gen.py +++ b/scripts/magic-gen.py @@ -1,4 +1,4 @@ -#!/bin/env python2 +#!/bin/env python3 import sys diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index c1c1e94af4..55c5be307f 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -20,7 +20,8 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/ppc64.*/ppc64/ \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ - -e s/aarch64.*/aarch64/) + -e s/aarch64.*/aarch64/ \ + -e s/loongarch64.*/loongarch64/) export SUBARCH ARCH diff --git a/scripts/nmk/scripts/main.mk 
b/scripts/nmk/scripts/main.mk index 493a164f88..7f11bda236 100644 --- a/scripts/nmk/scripts/main.mk +++ b/scripts/nmk/scripts/main.mk @@ -1,7 +1,7 @@ ifndef ____nmk_defined__main # -# Genaral inclusion statement +# General inclusion statement ifndef ____nmk_defined__include include $(__nmk_dir)include.mk diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 1681d4e909..724204a03c 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null || which python2 2>/dev/null) +FULL_PYTHON := $(shell which python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ diff --git a/soccr/soccr.c b/soccr/soccr.c index abea937033..8e1ce1c633 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -781,7 +781,7 @@ int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsi return 0; } -static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) +static int __send_queue(struct libsoccr_sk *sk, const char *queue, char *buf, __u32 len) { int ret, err = -1, max_chunk; int off; @@ -816,7 +816,7 @@ static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) continue; } - logerr("Can't restore %d queue data (%d), want (%d:%d:%d)", queue, ret, chunk, len, max_chunk); + logerr("Can't restore %s queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); goto err; } off += ret; @@ -837,7 +837,7 @@ static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) return -1; } - return __send_queue(sk, queue, buf, len); + return __send_queue(sk, queue == TCP_RECV_QUEUE ? 
"recv" : "send", buf, len); } static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue, @@ -876,7 +876,7 @@ static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_dat * they can be restored without any tricks. */ tcp_repair_off(sk->fd); - if (__send_queue(sk, TCP_SEND_QUEUE, buf + len, ulen)) + if (__send_queue(sk, "not-sent send", buf + len, ulen)) return -3; if (tcp_repair_on(sk->fd)) return -4; diff --git a/soccr/test/Makefile b/soccr/test/Makefile index 4585400454..499901b0c5 100644 --- a/soccr/test/Makefile +++ b/soccr/test/Makefile @@ -21,7 +21,6 @@ tcp-conn-v6: tcp-conn-v6.c test: tcp-constructor tcp-conn tcp-conn-v6 unshare -n sh -c "ip link set up dev lo; ./tcp-conn" unshare -n sh -c "ip link set up dev lo; ./tcp-conn-v6" - python run.py ./$(RUN) + python3 run.py ./$(RUN) .PHONY: test - diff --git a/soccr/test/run.py b/soccr/test/run.py index 1ffe58a582..57c556e361 100644 --- a/soccr/test/run.py +++ b/soccr/test/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys, os import hashlib diff --git a/soccr/test/tcp-test.py b/soccr/test/tcp-test.py index ff3fe29dc2..b48f532eb0 100755 --- a/soccr/test/tcp-test.py +++ b/soccr/test/tcp-test.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 -from __future__ import print_function import sys, socket import hashlib diff --git a/test/Makefile b/test/Makefile index e8fcffe3fc..5784b6a495 100644 --- a/test/Makefile +++ b/test/Makefile @@ -12,7 +12,7 @@ all: $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job skip-file-rwx-check +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job criu-ns skip-file-rwx-check other: for t in $(TESTS); do \ diff --git a/test/check_actions.py b/test/check_actions.py index 4973e39382..84d738dbb7 100755 
--- a/test/check_actions.py +++ b/test/check_actions.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/crit-recode.py b/test/crit-recode.py index 4135681e11..f119271d8b 100755 --- a/test/crit-recode.py +++ b/test/crit-recode.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import pycriu import sys import os diff --git a/test/exhaustive/pipe.py b/test/exhaustive/pipe.py index 7f1c53d34b..afe20846a3 100755 --- a/test/exhaustive/pipe.py +++ b/test/exhaustive/pipe.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import argparse import os diff --git a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 5b4c972cb6..689b1fb3ae 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os @@ -462,7 +462,7 @@ def set_nonblock(sk): def chk_real_state(st): - # Before enything else -- check that we still have + # Before anything else -- check that we still have # all the sockets at hands for sk in st.sockets: if not sk.visible: diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 252778969d..27e2b7b155 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import ctypes libc = ctypes.CDLL(None) diff --git a/test/javaTests/pom.xml b/test/javaTests/pom.xml index ddb6c89cf1..8a0b5bd4b3 100644 --- a/test/javaTests/pom.xml +++ b/test/javaTests/pom.xml @@ -18,6 +18,7 @@ test.xml + test-zero.xml diff --git a/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java b/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java index 860619c267..6c22c7260b 100644 --- a/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java +++ b/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java @@ -112,7 +112,7 @@ public void runtest(String testName, String checkpointOpt, String 
restoreOpt) th String pid; int exitCode; - System.out.println("======= Testing " + testName + " ========"); + System.out.println("======= Testing " + testName + " " + checkpointOpt + " ========"); testSetup(testName); diff --git a/test/javaTests/test-zero.xml b/test/javaTests/test-zero.xml new file mode 100644 index 0000000000..fe4d65471e --- /dev/null +++ b/test/javaTests/test-zero.xml @@ -0,0 +1,89 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 7f503e817a..4a6d55e6bf 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -9,7 +9,7 @@ prep ./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --report report -f h || fail # FIXME: fhandles looks broken on btrfs -grep -P "/.* / " /proc/self/mountinfo | grep -q btrfs || NOBTRFS=$? +findmnt --noheadings --target . | grep -q btrfs || NOBTRFS=$? if [ $NOBTRFS -eq 1 ] ; then ./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --pre 2 -f uns || fail fi diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore new file mode 100644 index 0000000000..c0b6a2490a --- /dev/null +++ b/test/others/action-script/.gitignore @@ -0,0 +1 @@ +img-dir-* diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile new file mode 100644 index 0000000000..f1ce191dbc --- /dev/null +++ b/test/others/action-script/Makefile @@ -0,0 +1,5 @@ +run: + @make -C .. 
loop + ./run.sh + +.PHONY: run diff --git a/test/others/action-script/action-script.sh b/test/others/action-script/action-script.sh new file mode 100755 index 0000000000..aba8292c05 --- /dev/null +++ b/test/others/action-script/action-script.sh @@ -0,0 +1,2 @@ +#!/bin/bash +touch action-hook-"$CRTOOLS_SCRIPT_ACTION" diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh new file mode 100755 index 0000000000..a82fccf359 --- /dev/null +++ b/test/others/action-script/run.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +set -ebm + +# shellcheck source=test/others/env.sh +source ../env.sh || exit 1 + +SELFDIR="$(dirname "$(readlink -f "$0")")" +SCRIPT="$SELFDIR/action-script.sh" +IMGDIR="$SELFDIR/img-dir-$$" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +trap "cleanup" QUIT TERM INT HUP EXIT + +# shellcheck disable=SC2317 +# https://github.com/koalaman/shellcheck/issues/2660 +function cleanup() +{ + if [[ -n "$PID" ]]; then + kill -9 "$PID" + fi +} + +PID=$(../loop) +if ! $CRIU dump -v4 -o dump.log -t "$PID" -D "$IMGDIR" --action-script "$SCRIPT"; then + echo "Failed to checkpoint process $PID" + cat dump.log + kill -9 "$PID" + exit 1 +fi + +if ! $CRIU restore -v4 -o restore.log -D "$IMGDIR" -d --pidfile test.pidfile --action-script "$SCRIPT"; then + echo "CRIU restore failed" + echo FAIL + exit 1 +fi + +PID=$(cat "$IMGDIR"/test.pidfile) + +found_missing_file=false +hooks=("pre-dump" "post-dump" "pre-restore" "pre-resume" "post-restore" "post-resume") + +for hook in "${hooks[@]}" +do + if [ ! 
-e "$IMGDIR/action-hook-$hook" ]; then + echo "ERROR: action-hook-$hook does not exist" + found_missing_file=true + fi +done + +if [ "$found_missing_file" = true ]; then + exit 1 +fi + +echo PASS + +rm -rf "$IMGDIR" +exit 0 diff --git a/test/others/app-emu/java/HelloWorld/run.sh b/test/others/app-emu/java/HelloWorld/run.sh index 0ed6afd141..e6dcbd9fca 100644 --- a/test/others/app-emu/java/HelloWorld/run.sh +++ b/test/others/app-emu/java/HelloWorld/run.sh @@ -18,7 +18,7 @@ setsid java HelloWorld & pid=${!} -echo Lanuched java application with pid $pid in background +echo Launched java application with pid $pid in background ${criu} dump -D dump -o dump.log -v4 --shell-job -t ${pid} || { echo "Dump failed" diff --git a/test/others/app-emu/make/run.sh b/test/others/app-emu/make/run.sh index 7cb44c7709..d871b7d9c4 100644 --- a/test/others/app-emu/make/run.sh +++ b/test/others/app-emu/make/run.sh @@ -28,7 +28,7 @@ setsid make -j4 & pid=${!} -echo Lanuched make in $pid background +echo Launched make in $pid background sleep 2 ${criu} dump --shell-job -D dump -o dump.log -v4 -t ${pid} || { diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 105aac72b4..2698bbd3c2 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -101,6 +101,8 @@ function run_test2 { ${CRIT} x ./ rss || exit 1 } +${CRIT} --version + gen_imgs run_test1 run_test2 diff --git a/test/others/criu-ns/Makefile b/test/others/criu-ns/Makefile new file mode 100644 index 0000000000..4d901a1116 --- /dev/null +++ b/test/others/criu-ns/Makefile @@ -0,0 +1,3 @@ +run: + @make -C ../.. 
zdtm_ct + ../../zdtm_ct run.py diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py new file mode 100755 index 0000000000..0a36438e80 --- /dev/null +++ b/test/others/criu-ns/run.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 + +import fcntl +import os +import pathlib +import pty +import shutil +import subprocess +import sys +import termios +import time + + +CRIU_BIN = "../../../criu/criu" +CRIU_NS = "../../../scripts/criu-ns" +IMG_DIR = "dumpdir" +DUMP_LOG = "dump.log" +RESTORE_LOG = "restore.log" +PIDFILE = "pidfile" + + +def check_dumpdir(path=IMG_DIR): + if os.path.isdir(path): + shutil.rmtree(path) + os.mkdir(path, 0o755) + + +def run_task_with_own_pty(task): + fd_m, fd_s = pty.openpty() + + pid = os.fork() + if pid == 0: + os.close(fd_m) + os.setsid() + os.dup2(fd_s, 0) + os.dup2(fd_s, 1) + os.dup2(fd_s, 2) + fcntl.ioctl(fd_s, termios.TIOCSCTTY, 1) + os.close(fd_s) + task() + exit(0) + + os.close(fd_s) + fd_m = os.fdopen(fd_m, "rb") + os.set_blocking(fd_m.fileno(), False) + + while True: + try: + data = fd_m.read() + except IOError: + break + if data is not None: + print(data.decode("utf-8")) + + _, status = os.waitpid(pid, 0) + + try: + data = fd_m.read() + except IOError as err: + print(err) + + if data is not None: + print(data.decode("utf-8")) + fd_m.close() + + if status != 0: + print("task %s exited badly: %d" % (task.__name__, status)) + exit(1) + + return 0 + + +def create_pty(): + fd_m, fd_s = pty.openpty() + return (os.fdopen(fd_m, "wb"), os.fdopen(fd_s, "wb")) + + +def create_isolated_dumpee(): + pathlib.Path("running").touch() + fd_m, fd_s = create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + os.dup2(fd_s.fileno(), 0) + os.dup2(fd_s.fileno(), 1) + os.dup2(fd_s.fileno(), 2) + fcntl.ioctl(fd_s.fileno(), termios.TIOCSCTTY, 1) + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + fd_m.close() + fd_s.close() + return pid + + +def criu_ns_dump(pid, shell_job=False): + cmd = [CRIU_NS, "dump", "-D", 
IMG_DIR, "-v4", "-t", str(pid), + "--log-file", DUMP_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + ret = subprocess.Popen(cmd).wait() + return ret + + +def criu_ns_restore(shell_job=False, restore_detached=False): + cmd = [CRIU_NS, "restore", "-D", IMG_DIR, "-v4", "--log-file", + RESTORE_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + if restore_detached: + cmd += ["--restore-detached", "--pidfile", PIDFILE] + ret = subprocess.Popen(cmd).wait() + return ret + + +def read_log_file(filename): + logfile_path = os.path.join(IMG_DIR, filename) + with open(logfile_path) as logfile: + print(logfile.read()) + + +def test_dump_and_restore_with_shell_job(): + print("Test criu-ns dump and restore with --shell-job option") + check_dumpdir() + pathlib.Path("running").touch() + pid = os.fork() + if pid == 0: + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + + ret = criu_ns_dump(pid, shell_job=True) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + os.unlink("running") + fd_m, fd_s = create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + fd_m.close() + # since criu-ns takes control of the tty stdin + os.dup2(fd_s.fileno(), 0) + ret = criu_ns_restore(shell_job=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + fd_s.close() + os.waitpid(pid, 0) + + +def test_dump_and_restore_without_shell_job(restore_detached=False): + print("Test criu-ns dump and restore with an isolated process" + "(%d)" % restore_detached) + check_dumpdir() + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + if not restore_detached: + os.unlink("running") + + pid = os.fork() + if pid == 0: + os.setsid() + ret = criu_ns_restore(restore_detached=restore_detached) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + os.waitpid(pid, 0) + + +def test_dump_and_restore_in_pidns(): 
+ if os.system("grep NSpid /proc/self/status"): + return + + print("Test criu-ns dump and restore in namespaces") + + def _dump(): + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _restore(): + ret = criu_ns_restore(restore_detached=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + def _get_restored_pid(): + restored_pid = 0 + pidfile_path = os.path.join(IMG_DIR, PIDFILE) + if not os.path.exists(pidfile_path): + raise FileNotFoundError("pidfile not found") + with open(pidfile_path, "r") as pidfile: + restored_pid = pidfile.read().strip() + return int(restored_pid) + + def _redump(): + global IMG_DIR + try: + restored_pid = _get_restored_pid() + except FileNotFoundError: + sys.exit(1) + IMG_DIR = "dumpdir2" + check_dumpdir(IMG_DIR) + ret = criu_ns_dump(restored_pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _re_restore(): + os.unlink("running") + ret = criu_ns_restore() + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + check_dumpdir() + _dump() + _restore() + _redump() + _re_restore() + + +def main(): + test_dump_and_restore_with_shell_job() + test_dump_and_restore_without_shell_job() + test_dump_and_restore_without_shell_job(restore_detached=True) + test_dump_and_restore_in_pidns() + + +if __name__ == "__main__": + run_task_with_own_pty(main) diff --git a/test/others/env.sh b/test/others/env.sh index 45066f760b..6fa2c9691b 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -1,17 +1,13 @@ #!/bin/sh -CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) +BASE_DIR="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/../../")" + +CRIU="${BASE_DIR}/criu/criu" criu=$CRIU -if [ $(which python3) ]; then - PYTHON=python3 -elif [ $(which python2) ]; then - PYTHON=python2 -else - echo "FAIL: Neither python3 nor python2" - exit 1 -fi -#export PYTHON -CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") + +export 
PYTHONPATH="${BASE_DIR}/lib:${BASE_DIR}/crit:${PYTHONPATH-}" +CRIT="python3 -m crit" crit=$CRIT -CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump-"${PYTHON}") + +CRIU_COREDUMP="${BASE_DIR}/coredump/coredump" criu_coredump=$CRIU_COREDUMP diff --git a/test/others/ext-tty/run.py b/test/others/ext-tty/run.py index 8109033cb9..2c268a2c8f 100755 --- a/test/others/ext-tty/run.py +++ b/test/others/ext-tty/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import os, sys, time, signal, pty diff --git a/test/others/mnt-ext-dev/run.sh b/test/others/mnt-ext-dev/run.sh index 5a1f44450a..3f6163e084 100755 --- a/test/others/mnt-ext-dev/run.sh +++ b/test/others/mnt-ext-dev/run.sh @@ -2,7 +2,7 @@ set -e -x # construct root -python ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns +python3 ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop @@ -11,7 +11,7 @@ dev=`losetup --find --show zdtm.loop` mkdir -p ../../dev cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev -python ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? +python3 ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? 
losetup -d $dev unlink zdtm.loop exit $ret diff --git a/test/others/mounts/mounts.sh b/test/others/mounts/mounts.sh index 51ea69540d..bed156a50c 100755 --- a/test/others/mounts/mounts.sh +++ b/test/others/mounts/mounts.sh @@ -20,7 +20,7 @@ for i in `awk '{ print $2 }' < /proc/self/mounts`; do umount -l $i done -python mounts.py +python3 mounts.py kill $INMNTNS_PID while :; do sleep 10 diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index e416f95e53..4ebe3e2801 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -4,8 +4,6 @@ set -x if [[ "$1" == "pid" ]]; then NS=pid - # CentOS 7 kernels do not have NSpid -> skip this test - grep NSpid /proc/self/status || exit 0 else NS=net fi diff --git a/test/others/ns_ext/run_pidns.sh b/test/others/ns_ext/run_pidns.sh index 08c5bff8e8..db12106e03 100755 --- a/test/others/ns_ext/run_pidns.sh +++ b/test/others/ns_ext/run_pidns.sh @@ -2,9 +2,6 @@ set -e -# CentOS 7 kernels do not have NSpid -> skip this test -grep NSpid /proc/self/status || exit 0 - # This test creates a process in non-host pidns and then dumps it and restores # it into host pidns. We use pid >100000 in non-host pidns to make sure it does # not intersect with some host pid on restore but it is potentially racy so diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index fc64f0c977..69537bb0d3 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -4,7 +4,7 @@ all: test-c rpc_pb2.py criu CFLAGS += -g -Werror -Wall -I. LDLIBS += -lprotobuf-c -PYTHON ?= python +PYTHON ?= python3 run: all @make -C .. 
loop diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 90c80fcaea..6cffe270d0 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import argparse import os diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index f84757efd6..b600b6d1c4 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Test criu errno import socket, os, errno diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index b51357d426..daeda49bce 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys, errno import rpc_pb2 as rpc diff --git a/test/others/rpc/restore-loop.py b/test/others/rpc/restore-loop.py index 84a2ce56d1..67110c2cf5 100755 --- a/test/others/rpc/restore-loop.py +++ b/test/others/rpc/restore-loop.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index 9be5775872..afd4fb5e33 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -1,14 +1,6 @@ #!/bin/bash -set -ex - -if [ -e /etc/os-release ]; then - . 
/etc/os-release - if [ "$ID" == "centos" ] && [[ "$VERSION_ID" == "7"* ]];then - echo "Skipping tests on CentOS 7 because they do not work in CI" - exit 0 - fi -fi +set -e CRIU=./criu diff --git a/test/others/rpc/setup_swrk.py b/test/others/rpc/setup_swrk.py index c7f84f952a..ffaa01de42 100644 --- a/test/others/rpc/setup_swrk.py +++ b/test/others/rpc/setup_swrk.py @@ -5,12 +5,6 @@ def setup_swrk(): print('Connecting to CRIU in swrk mode.') s1, s2 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) - - kwargs = {} - if sys.version_info.major == 3: - kwargs["pass_fds"] = [s1.fileno()] - - swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], **kwargs) + swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], pass_fds=[s1.fileno()]) s1.close() return swrk, s2 - diff --git a/test/others/rpc/test.py b/test/others/rpc/test.py index 80f6338f45..ce8411bc60 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/version.py b/test/others/rpc/version.py index 9d7fa745b5..a18cd5b7b7 100755 --- a/test/others/rpc/version.py +++ b/test/others/rpc/version.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import rpc_pb2 as rpc diff --git a/test/others/shell-job/run.py b/test/others/shell-job/run.py index a59945d6a7..969965f00f 100755 --- a/test/others/shell-job/run.py +++ b/test/others/shell-job/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os, pty, sys, subprocess import termios, fcntl, time diff --git a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh index 0803d78eca..0776ebf618 100755 --- a/test/others/skip-file-rwx-check/run.sh +++ b/test/others/skip-file-rwx-check/run.sh @@ -10,11 +10,11 @@ source ../env.sh make clean touch testfile chmod +w testfile -tail --follow testfile & -tailpid=$! -if ! 
"$criu" dump --tree=$tailpid --shell-job --verbosity=4 --log-file=dump.log +bash -c 'exec 3= 5: - import importlib.util - spec = importlib.util.spec_from_file_location(name, path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - else: - import imp - mod = imp.load_source(name, path) + import importlib.util + spec = importlib.util.spec_from_file_location(name, path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) return mod @@ -828,7 +826,7 @@ def __init__(self, name, desc, flavor, freezer, rootless): self._bins += self.__subs self._deps += get_test_desc('zdtm/lib/groups')['deps'] - self._env = {'ZDTM_TESTS': self.__real_name} + self._env['ZDTM_TESTS'] = self.__real_name def __get_start_cmd(self, name): tdir = os.path.dirname(name) @@ -838,7 +836,7 @@ def __get_start_cmd(self, name): subprocess.check_call(s_args + [tname + '.cleanout']) s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'], stdout=subprocess.PIPE) - out, _ = s.communicate() + out, _ = s.communicate(timeout=self.__timeout) cmd = out.decode().splitlines()[-1].strip() return 'cd /' + tdir + ' && ' + cmd @@ -882,7 +880,8 @@ def run(action, fault=None, strace=[], preexec=None, - nowait=False): + nowait=False, + timeout=60): env = dict( os.environ, ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0") @@ -898,7 +897,7 @@ def run(action, preexec_fn=preexec) if nowait: return cr - return cr.wait() + return cr.wait(timeout=timeout) class criu_rpc_process: @@ -981,7 +980,8 @@ def run(action, fault=None, strace=[], preexec=None, - nowait=False): + nowait=False, + timeout=None): if fault: raise test_fail_exc('RPC and FAULT not supported') if strace: @@ -1052,6 +1052,7 @@ def __init__(self, opts): self.__sat = bool(opts['sat']) self.__dedup = bool(opts['dedup']) self.__mdedup = bool(opts['noauto_dedup']) + self.__skip_zero_pages = bool(opts['skip_zero_pages']) self.__user = bool(opts['user']) self.__rootless = bool(opts['rootless']) 
self.__leave_stopped = bool(opts['stop']) @@ -1381,6 +1382,9 @@ def dump(self, action, opts=[]): if self.__dedup: a_opts += ["--auto-dedup"] + if self.__skip_zero_pages: + a_opts += ["--skip-zero-pages"] + a_opts += ["--timeout", "10"] criu_dir = os.path.dirname(os.getcwd()) @@ -2010,12 +2014,20 @@ def __init__(self, opts, nr_tests): file=self.__file_report) print(u"# ", file=self.__file_report) print(u"1.." + str(nr_tests), file=self.__file_report) - with open("/proc/sys/kernel/tainted") as taintfd: - self.__taint = taintfd.read() + self.__taint = self.__read_kernel_tainted() if int(self.__taint, 0) != 0: - print("The kernel is tainted: %r" % self.__taint) - if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != '1': - raise Exception("The kernel is tainted: %r" % self.__taint) + self.__report_kernel_taint("The kernel is tainted: %r" % self.__taint) + + @staticmethod + def __read_kernel_tainted(): + with open("/proc/sys/kernel/tainted") as taintfd: + return taintfd.read().strip() + + @staticmethod + def __report_kernel_taint(msg): + print(msg) + if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != "1": + raise Exception(msg) def __show_progress(self, msg): perc = int(self.__nr * 16 / self.__total) @@ -2041,11 +2053,12 @@ def run_test(self, name, desc, flavor): if len(self.__subs) >= self.__max: self.wait() - with open("/proc/sys/kernel/tainted") as taintfd: - taint = taintfd.read() + taint = self.__read_kernel_tainted() if self.__taint != taint: - raise Exception("The kernel is tainted: %r (%r)" % - (taint, self.__taint)) + prev_taint = self.__taint + self.__taint = taint + self.__report_kernel_taint( + "The kernel is tainted: %r (was %r)" % (taint, prev_taint)) ''' The option --link-remap allows criu to hardlink open files back to the @@ -2074,7 +2087,7 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 
'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless') + 'rootless', 'skip_zero_pages') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2228,9 +2241,21 @@ def all_tests(opts): continue files.append(fp) excl = list(map(lambda x: os.path.join(desc['dir'], x), desc['exclude'])) - tlist = list(filter( + tlist = list(sorted(filter( lambda x: not x.endswith('.checkskip') and not x.endswith('.hook') and - x not in excl, map(lambda x: x.strip(), files))) + x not in excl, map(lambda x: x.strip(), files)))) + + if opts.get('test_shard_count'): + if opts.get('test_shard_index') is None: + raise KeyError('--test_shard_count > 0 must come with --test_shard_index') + slice_idx = opts['test_shard_index'] + slices = opts['test_shard_count'] + if slice_idx >= slices: + raise IndexError('--test_shard_index not less than --test_shard_count ({} >= {})'.format(slice_idx, slices)) + slist = list(tlist[slice_idx::slices]) + print("We're shard #{} of {}. Running {} of {} tests.\n".format(slice_idx, slices, len(slist), len(tlist))) + tlist = slist + return tlist @@ -2341,11 +2366,6 @@ def run_tests(opts): return torun = list(torun) - if opts['keep_going'] and len(torun) < 2: - print( - "[WARNING] Option --keep-going is more useful when running multiple tests" - ) - opts['keep_going'] = False if opts['exclude']: excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") @@ -2388,6 +2408,7 @@ def run_tests(opts): "Specify --criu-image-streamer-dir or modify PATH to provide an alternate location") .format(streamer_dir)) + usernsIsSupported = criu.check("userns") launcher = Launcher(opts, len(torun)) try: for t in torun: @@ -2457,7 +2478,7 @@ def run_tests(opts): run_flavs = set(test_flavs) & set(opts_flavs) else: run_flavs = set([test_flavs.pop()]) - if not criu.check("userns"): + if not usernsIsSupported: run_flavs -= set(['uns']) if opts['user']: # FIXME -- probably uns will make sense @@ -2680,6 +2701,9 @@ def get_cli_args(): 
rp.add_argument("--noauto-dedup", help="Manual deduplicate images on iterations", action='store_true') + rp.add_argument("--skip-zero-pages", + help="Don't dump pages containing only zero bytes", + action='store_true') rp.add_argument("--nocr", help="Do not CR anything, just check test works", action='store_true') @@ -2767,6 +2791,10 @@ def get_cli_args(): rp.add_argument("--mntns-compat-mode", help="Use old compat mounts restore engine", action='store_true') + rp.add_argument("--test-shard-index", type=int, default=None, + help="Select tests for a shard (0-based)") + rp.add_argument("--test-shard-count", type=int, default=0, + help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index d345233154..24f32c6068 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -23,12 +23,12 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 else ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) # To build aarch32 on armv8 Travis-CI (see criu Makefile) - USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif endif @@ -40,8 +40,8 @@ endif PKG_CONFIG ?= pkg-config CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wdeclaration-after-statement -Wstrict-prototypes -CFLAGS += $(USERCFLAGS) -CFLAGS += -D_GNU_SOURCE +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) +CFLAGS += -D_GNU_SOURCE -D_LARGEFILE64_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include ifeq ($(strip $(V)),) diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 90bd28f9e1..428d726d66 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -1,6 +1,6 @@ LIBDIR := . 
-CFLAGS += $(USERCFLAGS) +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) LIB := libzdtmtst.a @@ -34,4 +34,4 @@ clean: clean-more $(LIB): $(LIBOBJ) $(E) " AR " $@ - $(Q)ar rcs $@ $^ + $(Q)$(AR) rcs $@ $^ diff --git a/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h new file mode 100644 index 0000000000..1803aaeb44 --- /dev/null +++ b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h @@ -0,0 +1,49 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +#define atomic_get(v) (*(volatile int *)v) +#define atomic_set(v, i) (*(v) = (i)) + +static inline int __atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(*v), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub_return(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(*ptr) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index bf8cd9cd31..efcc7a1d08 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u 
%4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); diff --git a/test/zdtm/lib/lock.h b/test/zdtm/lib/lock.h index 2b23550be5..cc5306e060 100644 --- a/test/zdtm/lib/lock.h +++ b/test/zdtm/lib/lock.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "asm/atomic.h" #define BUG_ON(condition) \ diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 6291ea4a7b..a5ba38b2dd 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -406,7 +406,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 4b3d2e3418..9689a939a3 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -62,6 +62,7 @@ TST_NOFILE := \ pthread_timers \ pthread_timers_h \ rseq00 \ + membarrier \ vdso00 \ vdso01 \ vdso02 \ @@ -84,7 +85,8 @@ TST_NOFILE := \ socket-tcp4v6 \ socket-tcp-local \ socket-tcp-reuseport \ - socket-tcp-nfconntrack \ + socket-tcp-ipt-nfconntrack \ + socket-tcp-nft-nfconntrack \ socket-tcp6-local \ socket-tcp4v6-local \ socket-tcpbuf \ @@ -215,6 +217,7 @@ TST_NOFILE := \ seccomp_filter_tsync \ seccomp_filter_threads \ seccomp_filter_inheritance \ + seccomp_no_new_privs \ different_creds \ vsx \ bridge \ @@ -256,6 +259,8 @@ TST_NOFILE := \ memfd02 \ memfd02-hugetlb \ memfd03 \ + memfd04 \ + memfd05 \ shmemfd \ shmemfd-priv \ time \ @@ -265,6 +270,8 @@ TST_NOFILE := \ sigtrap \ sigtrap01 \ 
change_mnt_context \ + fd_offset \ + zero_pages \ # jobctl00 \ PKG_CONFIG ?= pkg-config @@ -272,7 +279,7 @@ pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ - bpf_array + bpf_array endif ifneq ($(ARCH),arm) @@ -593,7 +600,8 @@ socket-tcpbuf6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp4v6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV4V6 socket-tcp-local: CFLAGS += -D ZDTM_TCP_LOCAL -socket-tcp-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_CONNTRACK +socket-tcp-ipt-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_IPT_CONNTRACK +socket-tcp-nft-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_NFT_CONNTRACK socket_listen6: CFLAGS += -D ZDTM_IPV6 socket_listen4v6: CFLAGS += -D ZDTM_IPV4V6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 @@ -653,6 +661,7 @@ socket-tcp6-unconn: CFLAGS += -D ZDTM_IPV6 socket-tcp4v6-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK -D ZDTM_IPV4V6 socket-tcp4v6-closing: CFLAGS += -D ZDTM_IPV4V6 memfd02-hugetlb: CFLAGS += -D ZDTM_HUGETLB +memfd05: CFLAGS += -D ZDTM_MEMFD05 sockets00-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET sockets01-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 713ffaa469..dc16368217 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 76de8b8b49..0bc36048cf 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = 
fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index bc8515264d..7bfb677623 100644 --- a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup02.c b/test/zdtm/static/cgroup02.c index 6229a8a089..8a925c0a43 100644 --- a/test/zdtm/static/cgroup02.c +++ b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index 8c40ffd6bd..f586a0628d 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -17,25 +17,25 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); -static const char *cgname = "zdtmtst"; +static const char *const cgname = "zdtmtst"; int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", dirname); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", subdir); return -1; } if (mount("none", subdir, "cgroup", 0, controller)) { - pr_perror("Can't mount cgroups"); + pr_perror("Can't mount cgroup controller %s at %s", controller, 
subdir); goto err_rd; } @@ -52,7 +52,8 @@ int mount_and_add(const char *controller, const char *path, const char *prop, co goto err_rs; ssprintf(paux, "%s/%s/special_prop_check", subdir, path); - mkdir(paux, 0600); + if (mkdir(paux, 0600) < 0) + pr_perror("Can't make dir %s", paux); return 0; err_rs: @@ -74,11 +75,11 @@ bool checkval(char *path, char *val) } n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) + pr_perror("read %s", path); close(fd); - if (n < 0) { - pr_perror("read"); + if (n < 0) return false; - } buf[n] = 0; if (strcmp(val, buf)) { @@ -95,7 +96,7 @@ int main(int argc, char **argv) char buf[1024], path[PATH_MAX]; struct stat sb; - char *dev_allow[] = { + const char *const dev_allow[] = { "c *:* m", "b *:* m", "c 1:3 rwm", "c 1:5 rwm", "c 1:7 rwm", "c 5:0 rwm", "c 5:2 rwm", "c 1:8 rwm", "c 1:9 rwm", "c 136:* rwm", "c 10:229 rwm", }; @@ -126,12 +127,14 @@ int main(int argc, char **argv) sprintf(path, "%s/devices/%s/devices.list", dirname, cgname); if (!checkval(path, buf)) { + errno = 0; fail(); goto out; } sprintf(path, "%s/memory/%s/memory.limit_in_bytes", dirname, cgname); if (!checkval(path, "268435456\n")) { + errno = 0; fail(); goto out; } @@ -143,6 +146,7 @@ int main(int argc, char **argv) } if (!S_ISDIR(sb.st_mode)) { + errno = 0; fail("special_prop_check not a directory?"); goto out; } diff --git a/test/zdtm/static/cgroup04.checkskip b/test/zdtm/static/cgroup04.checkskip index 205f8fc530..1ccbada4d0 100755 --- a/test/zdtm/static/cgroup04.checkskip +++ b/test/zdtm/static/cgroup04.checkskip @@ -1,3 +1,20 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +for ctl in devices memory; do + # Check that the controller is available. + + grep -q "^${ctl}\\s" /proc/cgroups + + # Check that the controller is not co-mounted with any other. + + # /proc/self/cgroup may have: + # "1:devices:/sys" + if ! 
grep -q "^[0-9]*:${ctl}:" /proc/self/cgroup; then + # but not eg: + # "1:devices,job:/sys" + grep -qE "^[0-9]*:([^:]*,)?${ctl}(,[^:]*)?:" /proc/self/cgroup && exit 1 + fi +done diff --git a/test/zdtm/static/cgroup_ifpriomap.checkskip b/test/zdtm/static/cgroup_ifpriomap.checkskip index 205f8fc530..f401ad1b24 100755 --- a/test/zdtm/static/cgroup_ifpriomap.checkskip +++ b/test/zdtm/static/cgroup_ifpriomap.checkskip @@ -1,3 +1,6 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +grep -q '^net_prio\s' /proc/cgroups diff --git a/test/zdtm/static/cgroup_yard.hook b/test/zdtm/static/cgroup_yard.hook index d06bc45fde..b70bd59e94 100755 --- a/test/zdtm/static/cgroup_yard.hook +++ b/test/zdtm/static/cgroup_yard.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 6d436014b3..8787ae5cf9 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ -46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %s", opts); + result = sscanf(pos, " - %*s %*s %1023s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/child_opened_proc.c b/test/zdtm/static/child_opened_proc.c index 2125cd264e..cfe04fa4be 100644 --- a/test/zdtm/static/child_opened_proc.c +++ b/test/zdtm/static/child_opened_proc.c @@ -10,7 +10,7 @@ #include "zdtmtst.h" const char *test_doc = "Check that tree prior to files opening"; -const char *test_author = "Stanislav Kinsbursky + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check that criu properly restores offsets on ELF files"; +const char *test_author = "Michal Clapinski "; + +void check_offset(int fd) +{ + int offset = lseek(fd, 0, SEEK_CUR); + if (offset < 0) { + fail("lseek"); + exit(1); + } + if (offset 
!= 0) { + fail("wrong offset; expected: 0, got: %d", offset); + exit(1); + } +} + +int main(int argc, char **argv) +{ + int fd; + + test_init(argc, argv); + + fd = open("/proc/self/exe", O_RDONLY); + if (fd < 0) { + fail("open"); + exit(1); + } + check_offset(fd); + + test_daemon(); + test_waitsig(); + + check_offset(fd); + + pass(); + return 0; +} diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index beea171f5d..bfdca51d93 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index d2049ebaa2..ae4827de97 100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 35ef41a21b..228e668925 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s 
%s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 11d224fa70..7e0d2654e1 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/file_locks06.checkskip b/test/zdtm/static/file_locks06.checkskip index 06ab585216..c5039a2d23 100755 --- a/test/zdtm/static/file_locks06.checkskip +++ b/test/zdtm/static/file_locks06.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import fcntl import tempfile import struct diff --git a/test/zdtm/static/maps00.c b/test/zdtm/static/maps00.c index b1e55e8614..f6989f3af7 100644 --- a/test/zdtm/static/maps00.c +++ b/test/zdtm/static/maps00.c @@ -137,7 +137,7 @@ static int check_map(struct map *map) } /* prot |= PROT_READ// need barrier before this line, because compiler change order commands. - I finded one method: look at next lines*/ + I found one method: look at next lines*/ } else prot &= PROT_WRITE | !PROT_READ | PROT_EXEC; diff --git a/test/zdtm/static/membarrier.c b/test/zdtm/static/membarrier.c new file mode 100644 index 0000000000..85d705ba7b --- /dev/null +++ b/test/zdtm/static/membarrier.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test membarrier() migration"; +const char *test_author = "Michał Mirosław "; + +/* + * Define membarrier() CMDs to avoid depending on exact kernel header version. 
+ */ +#define MEMBARRIER_CMD_GLOBAL_EXPEDITED (1 << 1) +#define MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED (1 << 2) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE (1 << 5) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE (1 << 6) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) +#define MEMBARRIER_CMD_GET_REGISTRATIONS (1 << 9) + +static int membarrier(int cmd, unsigned int flags, int cpu_id) +{ + return syscall(__NR_membarrier, cmd, flags, cpu_id); +} + +static const struct { + const char *name_suffix; + int register_cmd; + int execute_cmd; +} membarrier_cmds[] = { + { "GLOBAL_EXPEDITED", MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, + MEMBARRIER_CMD_GLOBAL_EXPEDITED }, + { "PRIVATE_EXPEDITED", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, + MEMBARRIER_CMD_PRIVATE_EXPEDITED }, + { "PRIVATE_EXPEDITED_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, + { "PRIVATE_EXPEDITED_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, +}; +static const int n_membarrier_cmds = sizeof(membarrier_cmds) / sizeof(*membarrier_cmds); + +static int register_membarriers(void) +{ + int barriers_supported, barriers_registered; + bool all_ok = true; + + barriers_supported = membarrier(MEMBARRIER_CMD_QUERY, 0, 0); + if (barriers_supported < 0) { + fail("membarrier() not supported by running kernel"); + return -1; + } + + barriers_registered = 0; + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_supported & membarrier_cmds[i].register_cmd) + continue; + + barriers_registered |= membarrier_cmds[i].register_cmd; + + if (membarrier(membarrier_cmds[i].register_cmd, 0, 0) < 0) { + pr_perror("membarrier(REGISTER_%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if 
(!all_ok) { + fail("can't register membarrier()s - tried %#x, kernel %#x", + barriers_registered, barriers_supported); + return -1; + } + + if (!barriers_registered) { + fail("no known membarrier() cmds are supported by the kernel"); + return -1; + } + + return barriers_registered; +} + +static bool check_membarriers_compat(int barriers_registered) +{ + bool all_ok = true; + + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_registered & membarrier_cmds[i].register_cmd) + continue; + if (membarrier(membarrier_cmds[i].execute_cmd, 0, 0) < 0) { + pr_perror("membarrier(%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) + fail("membarrier() check failed"); + + return all_ok; +} + +static bool check_membarriers_get_registrations(int barriers_registered) +{ + int ret = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + if (errno == EINVAL) { + test_msg("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS) not supported by running kernel"); + return true; + } + fail("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS)"); + return false; + } + if (ret != barriers_registered) { + fail("MEMBARRIER_CMD_GET_REGISTRATIONS check failed, expected: %d, got: %d", + barriers_registered, ret); + return false; + } + + return true; +} + +static bool check_membarriers(int barriers_registered) +{ + return check_membarriers_compat(barriers_registered) && + check_membarriers_get_registrations(barriers_registered); +} + +int main(int argc, char **argv) +{ + int barriers_registered; + + test_init(argc, argv); + + barriers_registered = register_membarriers(); + if (barriers_registered < 0) + return 1; + + test_msg("Pre-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + test_daemon(); + test_waitsig(); + + test_msg("Post-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + pass(); + return 0; +} diff --git a/test/zdtm/static/memfd00.c 
b/test/zdtm/static/memfd00.c index d037f69697..8d77ed06eb 100644 --- a/test/zdtm/static/memfd00.c +++ b/test/zdtm/static/memfd00.c @@ -30,8 +30,10 @@ int main(int argc, char *argv[]) { int fd, fl_flags1, fl_flags2, fd_flags1, fd_flags2; struct statfs statfs1, statfs2; + struct stat stat; off_t pos1, pos2; char buf[5]; + int fmode1, fmode2; test_init(argc, argv); @@ -58,6 +60,13 @@ int main(int argc, char *argv[]) if (lseek(fd, pos1, SEEK_SET) < 0) err(1, "seek error"); + if (fchmod(fd, 0642)) + err(1, "Can't set permission bits"); + + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode1 = stat.st_mode; + test_daemon(); test_waitsig(); @@ -85,6 +94,15 @@ int main(int argc, char *argv[]) return 1; } + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode2 = stat.st_mode; + + if (fmode1 != fmode2) { + fail("stat.st_mode = %#o != %#o", fmode2, fmode1); + return 1; + } + pos2 = lseek(fd, 0, SEEK_CUR); if (pos1 != pos2) { fail("position differs"); diff --git a/test/zdtm/static/memfd04.c b/test/zdtm/static/memfd04.c new file mode 100644 index 0000000000..215e949d15 --- /dev/null +++ b/test/zdtm/static/memfd04.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "exec(memfd)"; +const char *test_author = "Michał Mirosław "; + +static int _memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} + +static int _execveat(int dirfd, const char *pathname, const char *const argv[], const char *const envp[], int flags) +{ + return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); +} + +static const char *const script_argv[] = { "true", NULL }; +static const char *const script_env[] = { NULL }; + +static bool test_exec_fd(int fd) +{ + int err, pid, status; + + err = fcntl(fd, F_GETFD); + if (err < 0) { + fail("fcntl(F_GETFD)"); + return false; + } + if (err) { + errno = 0; + fail("F_GETFD for the memfd 
returned %d but expected 0", err); + return false; + } + + pid = fork(); + if (!pid) { + _execveat(fd, "", script_argv, script_env, AT_EMPTY_PATH); + err = errno; + pr_perror("execveat()"); + _exit(err); + } + + if (pid < 0) { + fail("fork()"); + return false; + } + + while (waitpid(pid, &status, 0) != pid) { + if (errno == EINTR) + continue; + fail("waitpid(child=%d)", pid); + return false; + } + + if (status != 0) { + pr_err("child exited with status=%d\n", status); + return false; + } + + return true; +} + +static const char script[] = "#!/bin/true"; +static const size_t script_len = sizeof(script) - 1; + +int main(int argc, char *argv[]) +{ +#ifdef MEMFD05 + char path[PATH_MAX]; + char *addr_p, *addr_s; + int rofd; +#endif + int fd; + + test_init(argc, argv); + + fd = _memfd_create("somename", 0); + if (fd < 0) { + pr_perror("memfd_create()"); + return 1; + } + if (ftruncate(fd, script_len) == -1) { + pr_perror("ftruncate"); + return 1; + } + if (write(fd, script, script_len) != script_len) { + pr_perror("write(memfd)"); + return 1; + } +#ifdef MEMFD05 + snprintf(path, PATH_MAX - 1, "/proc/self/fd/%d", fd); + rofd = open(path, O_RDONLY); + if (rofd < 0) { + pr_perror("unable to open read-only memfd"); + return 1; + } + addr_p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, rofd, 0); + if (addr_p == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + addr_s = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr_s == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } +#endif + + if (!test_exec_fd(fd)) + return 1; + + test_msg("execveat(memfd) succeeded before C/R.\n"); + + test_daemon(); + test_waitsig(); + + if (!test_exec_fd(fd)) + return 1; + + pass(); + + return 0; +} diff --git a/test/zdtm/static/memfd04.desc b/test/zdtm/static/memfd04.desc new file mode 100644 index 0000000000..bbf136d145 --- /dev/null +++ b/test/zdtm/static/memfd04.desc @@ -0,0 +1 @@ +{'deps': ['/bin/true']} diff --git 
a/test/zdtm/static/memfd05.c b/test/zdtm/static/memfd05.c new file mode 120000 index 0000000000..6caa9556fb --- /dev/null +++ b/test/zdtm/static/memfd05.c @@ -0,0 +1 @@ +memfd04.c \ No newline at end of file diff --git a/test/zdtm/static/memfd05.desc b/test/zdtm/static/memfd05.desc new file mode 120000 index 0000000000..1b4963572b --- /dev/null +++ b/test/zdtm/static/memfd05.desc @@ -0,0 +1 @@ +memfd04.desc \ No newline at end of file diff --git a/test/zdtm/static/mntns_open.c b/test/zdtm/static/mntns_open.c index 7d8bbbaa4e..0430f5b998 100644 --- a/test/zdtm/static/mntns_open.c +++ b/test/zdtm/static/mntns_open.c @@ -17,7 +17,7 @@ #define CLONE_NEWNS 0x00020000 #endif -const char *test_doc = "Check that mnt_id is repsected"; +const char *test_doc = "Check that mnt_id is respected"; const char *test_author = "Pavel Emelianov "; #define MPTS_FILE "F" diff --git a/test/zdtm/static/mntns_root_bind.c b/test/zdtm/static/mntns_root_bind.c index 9e1ba06e63..4c0347cb22 100644 --- a/test/zdtm/static/mntns_root_bind.c +++ b/test/zdtm/static/mntns_root_bind.c @@ -71,7 +71,7 @@ int main(int argc, char **argv) task_waiter_wait4(&t, 2); if (access(bspath, F_OK)) { - fail("%s isn't accessiable", bspath); + fail("%s isn't accessible", bspath); return 1; } diff --git a/test/zdtm/static/net_lock_socket_iptables.hook b/test/zdtm/static/net_lock_socket_iptables.hook index 0ee147eb2b..e9fcd73509 100755 --- a/test/zdtm/static/net_lock_socket_iptables.hook +++ b/test/zdtm/static/net_lock_socket_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import socket import time diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index 1e6ee1dea5..f268f2fece 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%s", val); + ret = fscanf(fp, "%200s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff --git 
a/test/zdtm/static/netns_lock_iptables.hook b/test/zdtm/static/netns_lock_iptables.hook index e7daf8a655..b51d3c2cc2 100755 --- a/test/zdtm/static/netns_lock_iptables.hook +++ b/test/zdtm/static/netns_lock_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import socket @@ -67,7 +67,7 @@ if sys.argv[1] == "--post-start": cln, addr = srv.accept() cln.sendall(str.encode("--post-restore")) cln.close() - + # Server will be closed when zdtm sends SIGKILL if sys.argv[1] == "--pre-dump": diff --git a/test/zdtm/static/ofd_file_locks.c b/test/zdtm/static/ofd_file_locks.c index 68b6f22f52..a68fa38eeb 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ - num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); + num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); diff --git a/test/zdtm/static/seccomp_filter_inheritance.c b/test/zdtm/static/seccomp_filter_inheritance.c index 7a86cd85ee..5afcb3f845 100644 --- a/test/zdtm/static/seccomp_filter_inheritance.c +++ b/test/zdtm/static/seccomp_filter_inheritance.c @@ -100,7 +100,7 @@ int main(int argc, char **argv) if (filter_syscall(__NR_ptrace) < 0) _exit(1); - if (filter_syscall(__NR_fstat) < 0) + if (filter_syscall(__NR_statx) < 0) _exit(1); zdtm_seccomp = 1; diff --git a/test/zdtm/static/seccomp_no_new_privs.c b/test/zdtm/static/seccomp_no_new_privs.c new file mode 100644 index 0000000000..95f9501ed4 --- /dev/null +++ b/test/zdtm/static/seccomp_no_new_privs.c @@ -0,0 +1,42 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that NO_NEW_PRIVS attribute is restored"; +const char *test_author = "Michał Mirosław "; 
+ +int main(int argc, char **argv) +{ + int ret; + + test_init(argc, argv); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 0) + fail("initial NO_NEW_PRIVS = %d != 0", ret); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (ret) { + pr_perror("Can't set NO_NEW_PRIVS attribute"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 1) + fail("restored NO_NEW_PRIVS = %d != 1", ret); + + pass(); + return 0; +} diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c index 08970c0daf..d890410d89 100644 --- a/test/zdtm/static/sock_ip_opts00.c +++ b/test/zdtm/static/sock_ip_opts00.c @@ -3,6 +3,7 @@ #include #include +#include #include #include "zdtmtst.h" @@ -19,11 +20,13 @@ const char *test_author = "Pavel Tikhomirov "; struct sk_opt { int level; int opt; + int val; }; struct sk_opt sk_opts_v4[] = { - { SOL_IP, IP_FREEBIND }, - { SOL_IP, IP_PKTINFO }, + { SOL_IP, IP_FREEBIND, IP_OPT_VAL }, + { SOL_IP, IP_PKTINFO, IP_OPT_VAL }, + { SOL_IP, IP_TOS, IPTOS_TOS(IPTOS_THROUGHPUT) }, }; #ifndef IPV6_FREEBIND @@ -31,8 +34,8 @@ struct sk_opt sk_opts_v4[] = { #endif struct sk_opt sk_opts_v6[] = { - { SOL_IPV6, IPV6_FREEBIND }, - { SOL_IPV6, IPV6_RECVPKTINFO }, + { SOL_IPV6, IPV6_FREEBIND, IP_OPT_VAL }, + { SOL_IPV6, IPV6_RECVPKTINFO, IP_OPT_VAL }, }; struct sk_conf { @@ -71,7 +74,7 @@ int main(int argc, char **argv) n_opts = sk_confs[i].domain == AF_INET ? 
ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); for (j = 0; j < n_opts; j++) { - val = IP_OPT_VAL; + val = opts[j].val; if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); goto close; @@ -93,7 +96,7 @@ int main(int argc, char **argv) goto close; } - if (val != IP_OPT_VAL) { + if (val != opts[j].val) { fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); goto close; diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index 5b4624f6de..fcf00ffed8 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -12,22 +12,28 @@ const char *test_author = "Pavel Emelyanov "; #define TEST_PORT 59687 #define TEST_ADDR INADDR_ANY -#define NOPTS 8 - int main(int argc, char **argv) { - int sock, ret = 0, vname[NOPTS], val[NOPTS], rval, i; + #define OPT(x) { x, #x } + static const struct { + int opt; + const char *name; + } vname[] = { + OPT(SO_PRIORITY), + OPT(SO_RCVLOWAT), + OPT(SO_MARK), + OPT(SO_PASSCRED), + OPT(SO_PASSSEC), + OPT(SO_DONTROUTE), + OPT(SO_NO_CHECK), + OPT(SO_OOBINLINE), + }; + static const int NOPTS = sizeof(vname) / sizeof(*vname); + #undef OPT + + int sock, ret = 0, val[NOPTS], rval, i; socklen_t len = sizeof(int); - vname[0] = SO_PRIORITY; - vname[1] = SO_RCVLOWAT; - vname[2] = SO_MARK; - vname[3] = SO_PASSCRED; - vname[4] = SO_PASSSEC; - vname[5] = SO_DONTROUTE; - vname[6] = SO_NO_CHECK; - vname[7] = SO_OOBINLINE; - test_init(argc, argv); sock = socket(PF_INET, SOCK_STREAM, 0); @@ -37,29 +43,29 @@ int main(int argc, char **argv) } for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &val[i], &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { - pr_perror("can't get option %d", i); + pr_perror("can't get %s", vname[i].name); return 1; } val[i]++; - ret = 
setsockopt(sock, SOL_SOCKET, vname[i], &val[i], len); + ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { - pr_perror("can't set option %d", i); + pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d 2", i); + pr_perror("can't re-get %s", vname[i].name); return 1; } if (rval != val[i]) { if (rval + 1 == val[i]) { - pr_perror("can't reset option %d want %d have %d", i, val[i], rval); + pr_perror("failed to set %s: want %d have %d", vname[i].name, val[i], rval); return 1; } @@ -72,14 +78,15 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d again", i); + pr_perror("can't verify %s", vname[i].name); return 1; } if (val[i] != rval) { - fail("option %d changed", i); + errno = 0; + fail("%s changed: %d -> %d", vname[i].name, val[i], rval); return 1; } } diff --git a/test/zdtm/static/socket-tcp-fin-wait1.hook b/test/zdtm/static/socket-tcp-fin-wait1.hook index 9504557dad..30f8ce0710 100755 --- a/test/zdtm/static/socket-tcp-fin-wait1.hook +++ b/test/zdtm/static/socket-tcp-fin-wait1.hook @@ -1,7 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys -sys.path.append("../crit") +sys.path.append("../lib") import pycriu import os, os.path diff --git a/test/zdtm/static/socket-tcp-nfconntrack.c b/test/zdtm/static/socket-tcp-ipt-nfconntrack.c similarity index 100% rename from test/zdtm/static/socket-tcp-nfconntrack.c rename to test/zdtm/static/socket-tcp-ipt-nfconntrack.c diff --git a/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc new file mode 100644 index 0000000000..53dd822854 --- /dev/null +++ 
b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'has_ipt_legacy', + 'flavor': 'h', + 'opts': '--tcp-established', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp-nfconntrack.desc b/test/zdtm/static/socket-tcp-nfconntrack.desc deleted file mode 100644 index add2513f81..0000000000 --- a/test/zdtm/static/socket-tcp-nfconntrack.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.c b/test/zdtm/static/socket-tcp-nft-nfconntrack.c new file mode 120000 index 0000000000..8cb60dd03a --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.desc b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc new file mode 100644 index 0000000000..38a4eb3897 --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc @@ -0,0 +1,7 @@ +{ + 'flavor': 'h', + 'feature': 'network_lock_nftables', + 'opts': '--tcp-established', + 'dopts': '--network-lock nftables', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index f6ef473853..9830c7860a 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -67,17 +67,38 @@ int main(int argc, char **argv) int val; socklen_t optlen; -#ifdef ZDTM_CONNTRACK +#ifdef ZDTM_IPT_CONNTRACK if (unshare(CLONE_NEWNET)) { pr_perror("unshare"); return 1; } if (system("ip link set up dev lo")) return 1; - if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + + if (system("iptables-legacy -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + return 1; + if (system("iptables-legacy -w -A INPUT -j DROP")) + return 1; + +#endif + +#ifdef ZDTM_NFT_CONNTRACK + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); return 1; - if (system("iptables -w -A INPUT -j DROP")) + } + 
if (system("ip link set up dev lo")) + return 1; + + if (system("nft add table ip filter")) return 1; + if (system("nft add chain ip filter INPUT")) + return 1; + if (system("nft add rule ip filter INPUT iifname \"lo\" ip protocol tcp ct state new,established counter accept")) + return 1; + if (system("nft add rule ip filter INPUT counter drop")) + return 1; + #endif #ifdef ZDTM_TCP_LOCAL diff --git a/test/zdtm/static/socket_udp_shutdown.c b/test/zdtm/static/socket_udp_shutdown.c index 91dc8f30a4..a7658b9dd7 100644 --- a/test/zdtm/static/socket_udp_shutdown.c +++ b/test/zdtm/static/socket_udp_shutdown.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) test_init(argc, argv); - sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); - sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + sk1 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); + sk2 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); if (sk1 < 0 || sk2 < 0) { pr_perror("Can't create socket"); exit(1); diff --git a/test/zdtm/static/stopped.c b/test/zdtm/static/stopped.c index 059a2a92aa..26b0174eda 100644 --- a/test/zdtm/static/stopped.c +++ b/test/zdtm/static/stopped.c @@ -65,7 +65,7 @@ int main(int argc, char **argv) } if (WIFSTOPPED(status)) - test_msg("The procces stopped\n"); + test_msg("The process stopped\n"); else { fail("The process doesn't stopped"); goto out; diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index ab88120c2c..55609f2605 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -17,6 +17,7 @@ int main(int argc, char **argv) unsigned long orig_flags = 0, new_flags = 0; unsigned long orig_madv = 0, new_madv = 0; void *area; + int ret; test_init(argc, argv); @@ -35,9 +36,46 @@ int main(int argc, char **argv) return -1; } + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("prctl(GET_THP_DISABLE) returned 
unexpected value: %d != 1", ret); + return -1; + } + + test_msg("Fetch pre-migration flags/adv\n"); + if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) + return -1; + + errno = 0; + if (orig_flags != new_flags) { + fail("Flags changed %lx -> %lx", orig_flags, new_flags); + return -1; + } + + if (orig_madv != new_madv) { + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); + return -1; + } + test_daemon(); test_waitsig(); + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting post-migration THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); + return -1; + } + if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { pr_perror("Enabling THP failed"); return -1; @@ -47,15 +85,14 @@ int main(int argc, char **argv) if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) return -1; + errno = 0; if (orig_flags != new_flags) { - pr_err("Flags are changed %lx -> %lx\n", orig_flags, new_flags); - fail(); + fail("Flags changed %lx -> %lx", orig_flags, new_flags); return -1; } if (orig_madv != new_madv) { - pr_err("Madvs are changed %lx -> %lx\n", orig_madv, new_madv); - fail(); + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); return -1; } diff --git a/test/zdtm/static/thread_different_uid_gid.c b/test/zdtm/static/thread_different_uid_gid.c index 3a0b6291b1..88f99659b3 100644 --- a/test/zdtm/static/thread_different_uid_gid.c +++ b/test/zdtm/static/thread_different_uid_gid.c @@ -130,7 +130,7 @@ int main(int argc, char **argv) ret = syscall(SYS_setresgid, maingroup, maingroup, maingroup); if (ret >= 0) { ret = syscall(SYS_setresuid, mainuser, mainuser, mainuser); - } else if (ret < 0) { + } else { pr_perror("Failed to drop privileges"); exit(1); } diff --git a/test/zdtm/static/zero_pages.c b/test/zdtm/static/zero_pages.c new file mode 100644 index 0000000000..700f0fcc00 --- /dev/null +++ 
b/test/zdtm/static/zero_pages.c @@ -0,0 +1,161 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check the --skip-zero-pages flag"; +const char *test_author = "Volker Simonis "; + +#define PME_PFRAME_MASK ((1ULL << 55) - 1) +uint64_t zero_page_pfn; +int page_size; +int pagemap; + +static uint64_t vaddr_to_pfn(unsigned long vaddr) +{ + uint64_t pfn; + off_t off = (vaddr / page_size) * sizeof(uint64_t); + if (pread(pagemap, &pfn, sizeof(pfn), off) != sizeof(pfn)) { + pr_perror("Can't read pme"); + exit(1); + } else { + return (pfn & PME_PFRAME_MASK); + } +} + +static void init_zero_page_pfn(void) +{ + void *addr; + if ((pagemap = open("/proc/self/pagemap", O_RDONLY)) == -1) { + pr_perror("Can't open /proc/self/pagemap"); + exit(1); + } + if ((addr = mmap(NULL, page_size, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED) { + pr_perror("Unable to map zero page"); + exit(1); + } + if (*((int *)addr) != 0) { + pr_perror("Newly mapped page must be zero"); + exit(1); + } + zero_page_pfn = vaddr_to_pfn((unsigned long)addr); + munmap(addr, page_size); + + if (zero_page_pfn == 0) { + pr_err("zero_page_pfn is invalid.\n"); + exit(1); + } + fprintf(stderr, "zero_page_pfn = %" PRIu64 "\n", zero_page_pfn); +} + +static int pages_in_mem(char *addr, int nr_of_pages) +{ + int counter = 0; + unsigned char pages[nr_of_pages]; + if (mincore(addr, page_size * nr_of_pages, pages) == -1) { + pr_perror("Can't call mincore"); + exit(1); + } + for (int i = 0; i < nr_of_pages; i++) { + if ((pages[i] & 0x1)) { + counter++; + } + } + return counter; +} + +static int zero_pages(char *addr, int nr_of_pages) +{ + int counter = 0; + for (int i = 0; i < nr_of_pages; i++, addr += page_size) { + if (vaddr_to_pfn((unsigned long)addr) == zero_page_pfn) { + counter++; + } + } + return counter; +} + +int main(int argc, char **argv) +{ + char *addr; + int nr_of_pages = 64; + + 
test_init(argc, argv); + + page_size = sysconf(_SC_PAGESIZE); + + init_zero_page_pfn(); + + addr = (char *)mmap(NULL, page_size * nr_of_pages, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't mmap %d bytes", page_size * nr_of_pages); + exit(1); + } + /* Check that pages are not in memory yet */ + if (pages_in_mem(addr, nr_of_pages) != 0) { + pr_err("Pages shouldn't be in memory yet.\n"); + exit(1); + } + for (int i = 0; i < nr_of_pages; i++) { + /* Read pages to bring them into memory */ + if (addr[i * page_size] != 0) { + pr_err("All pages should have zero content.\n"); + exit(1); + } + } + /* Check that all pages reference the zero page */ + if (zero_pages(addr, nr_of_pages) != nr_of_pages) { + pr_err("All pages should reference the zero page.\n"); + exit(1); + } + for (int i = 0; i < nr_of_pages; i++) { + /* Write pages to COW them */ + addr[i * page_size] = 0; + } + /* Check that all pages are mapped to distinct physical pages */ + if (pages_in_mem(addr, nr_of_pages) != nr_of_pages) { + pr_err("All pages should be in memory.\n"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + /* Check that pages are not in memory yet */ + if (pages_in_mem(addr, nr_of_pages) != 0) { + fail("Pages shouldn't be in memory yet"); + goto out; + } + for (int i = 0; i < nr_of_pages; i++) { + /* Read pages to bring them into memory */ + if (addr[i * page_size] != 0) { + fail("All pages should have zero content"); + } + } + /* Check that all pages reference the zero page */ + if (zero_pages(addr, nr_of_pages) != nr_of_pages) { + fail("All pages should reference the zero page"); + goto out; + } + for (int i = 0; i < nr_of_pages; i++) { + /* Write pages to COW them */ + addr[i * page_size] = 0; + } + /* Check that all pages are mapped to distinct physical pages */ + if (pages_in_mem(addr, nr_of_pages) != nr_of_pages) { + fail("All pages should be in memory"); + goto out; + } + + pass(); +out: + return 0; +} diff --git 
a/test/zdtm/static/zero_pages.desc b/test/zdtm/static/zero_pages.desc new file mode 100644 index 0000000000..b9804bf35b --- /dev/null +++ b/test/zdtm/static/zero_pages.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--skip-zero-pages', 'flags': 'suid'} diff --git a/test/zdtm/transition/ipc.c b/test/zdtm/transition/ipc.c index 0f16dbc685..7660f70af0 100644 --- a/test/zdtm/transition/ipc.c +++ b/test/zdtm/transition/ipc.c @@ -178,7 +178,7 @@ int main(int argc, char **argv) pr_perror("Child 2 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; - pr_perror("Child 2 couldn't inititalise"); + pr_perror("Child 2 couldn't initialise"); } out_child: kill(pid1, SIGTERM); @@ -188,7 +188,7 @@ int main(int argc, char **argv) pr_perror("Child 1 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; - pr_perror("Child 1 couldn't inititalise"); + pr_perror("Child 1 couldn't initialise"); } out_shdt: shmdt(mem); diff --git a/test/zdtm/transition/lazy-thp.c b/test/zdtm/transition/lazy-thp.c index 2bf99dc4ca..2e9722b963 100644 --- a/test/zdtm/transition/lazy-thp.c +++ b/test/zdtm/transition/lazy-thp.c @@ -25,7 +25,7 @@ int main(int argc, char **argv) test_init(argc, argv); - /* we presume that malloc returns not page aliged address */ + /* we presume that malloc returns not page aligned address */ mem = malloc(PAGE_SIZE * N_PAGES); org = malloc(PAGE_SIZE); if (!mem || !org) { diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 5e849b904b..44316893da 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -102,7 +102,7 @@ int main(int argc, char **argv) /* * pidns is used to avoid conflicts * mntns is used to mount /proc - * net is used to avoid conflicts of parasite sockets + * net is used to avoid conflicts between network tests */ if (!uid) if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC))