diff --git a/.cirrus.yml b/.cirrus.yml index 671178d8b0..ef0de54e99 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -27,7 +27,7 @@ task: compute_engine_instance: image_project: centos-cloud - image: family/centos-8 + image: family/centos-stream-8 platform: linux cpu: 4 memory: 8G @@ -47,7 +47,7 @@ task: pip3 install junit_xml build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 + make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" task: name: CentOS 7 based test diff --git a/.clang-format b/.clang-format index dd4ade3703..96ba5909f4 100644 --- a/.clang-format +++ b/.clang-format @@ -15,7 +15,7 @@ AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlines: Left # Unknown to clang-format-4.0 AlignOperands: true -AlignTrailingComments: false +AlignTrailingComments: true AlignConsecutiveMacros: true AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index 7012132766..927ddced26 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - target: [armv7-cross, aarch64-cross, ppc64-cross, mips64el-cross] + target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross] branches: [criu-dev, master] steps: diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index 90862e7abd..be8e7f09c2 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -6,9 +6,26 @@ jobs: build: runs-on: ubuntu-latest + continue-on-error: ${{ matrix.experimental }} strategy: + fail-fast: false matrix: - target: [armv7-cross, aarch64-cross, ppc64-cross, mips64el-cross] + experimental: [false] + target: [ + armv7-stable-cross, + aarch64-stable-cross, + 
ppc64-stable-cross, + mips64el-stable-cross, + ] + include: + - experimental: true + target: armv7-unstable-cross + - experimental: true + target: aarch64-unstable-cross + - experimental: true + target: ppc64-unstable-cross + - experimental: true + target: mips64el-unstable-cross steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index f1b38e77e6..f782c5b9df 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -10,5 +10,7 @@ jobs: - uses: actions/checkout@v2 - name: Run Coverage Tests run: sudo -E make -C scripts/ci local GCOV=1 + - name: Run gcov + run: sudo -E find . -name '*gcda' -type f -print0 | sudo -E xargs --null --max-args 128 --max-procs 4 gcov - name: Run Coverage Analysis run: sudo -E make codecov diff --git a/.lgtm.yml b/.lgtm.yml new file mode 100644 index 0000000000..a28c35de0f --- /dev/null +++ b/.lgtm.yml @@ -0,0 +1,30 @@ +extraction: + cpp: + prepare: + packages: + - "protobuf-c-compiler" + - "libprotobuf-c-dev" + - "libprotobuf-dev" + - "build-essential" + - "libprotobuf-dev" + - "libprotobuf-c-dev" + - "protobuf-c-compiler" + - "protobuf-compiler" + - "python3-protobuf" + - "libnet-dev" + - "pkg-config" + - "libnl-3-dev" + - "libbsd0" + - "libbsd-dev" + - "iproute2" + - "libcap-dev" + - "libaio-dev" + - "python3-yaml" + - "libnl-route-3-dev" + - "python-future" + - "gnutls-dev" + configure: + command: + - "ls -laR images/google" + - "ln -s /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto" + - "ls -laR images/google" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 96972296e0..864caf93e2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,8 +1,3 @@ -[![master](https://travis-ci.org/checkpoint-restore/criu.svg?branch=master)](https://travis-ci.org/checkpoint-restore/criu) -[![development](https://travis-ci.org/checkpoint-restore/criu.svg?branch=criu-dev)](https://travis-ci.org/checkpoint-restore/criu) 
-[![Codacy Badge](https://api.codacy.com/project/badge/Grade/55251ec7db28421da4481fc7c1cb0cee)](https://www.codacy.com/app/xemul/criu?utm_source=github.com&utm_medium=referral&utm_content=xemul/criu&utm_campaign=Badge_Grade) -

- ## How to contribute to CRIU CRIU project is (almost) the never-ending story, because we have to always keep up with the diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 690f61e14a..57b791138b 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -242,6 +242,12 @@ In other words, do not use it unless really needed. Tell *criu* that one end of a pair of UNIX sockets (created by *socketpair*(2)) with the given _id_ is OK to be disconnected. +*--external* **net[**__inode__**]:**__name__:: + Mark a network namespace as external and do not include it in the + checkpoint. The label 'name' can be used with *--inherit-fd* during + restore to specify a file descriptor to a preconfigured network + namespace. + *--external* **pid[**__inode__**]:**__name__:: Mark a PID namespace as external. This can be later used to restore a process into an existing PID namespace. The label 'name' can be @@ -328,7 +334,8 @@ mount -t cgroup -o devices,freezer none devices,freezer Checkpoint established TCP connections. *--tcp-close*:: - Don't dump the state of, or block, established tcp connections. + Don't dump the state of, or block, established tcp connections + (including connections that were once established but are now closed). This is useful when tcp connections are not going to be restored. *--skip-in-flight*:: @@ -360,6 +367,10 @@ mount -t cgroup -o devices,freezer none devices,freezer Allows to link unlinked files back, if possible (modifies filesystem during *restore*). +*--timeout* 'number':: + Set a time limit in seconds for collecting tasks during the + dump operation. The timeout is 10 seconds by default. + *--ghost-limit* 'size':: Set the maximum size of deleted file to be carried inside image. By default, up to 1M file is allowed. Using this @@ -408,7 +419,7 @@ By default the option is set to *fpu* and *ins*. Set the method to be used to validate open files. 
Validation is done to ensure that the version of the file being restored is the same version when it was dumped. - ++ The 'mode' may be one of the following: *filesize*::: @@ -521,7 +532,7 @@ usually need to be escaped from shell. Restore cgroups configuration associated with a task from the image. Controllers are always restored in an optimistic way -- if already present in system, *criu* reuses it, otherwise it will be created. - ++ The 'mode' may be one of the following: *none*::: Do not restore cgroup properties but require cgroup to @@ -645,7 +656,7 @@ are not adequate, but this can be suppressed by using *--cpu-cap=none*. Set the method to be used to validate open files. Validation is done to ensure that the version of the file being restored is the same version when it was dumped. - ++ The 'mode' may be one of the following: *filesize*::: diff --git a/Makefile b/Makefile index 08761efed6..c0d0083718 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ ifeq ($(ARCH),arm) endif ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a + USERCFLAGS += -march=armv7-a+fp endif ifeq ($(ARMV),8) @@ -147,7 +147,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: flog criu lib crit .PHONY: all # @@ -233,6 +233,15 @@ soccr/built-in.o: $(CONFIG_HEADER) .FORCE $(SOCCR_A): |soccr/built-in.o criu-deps += $(SOCCR_A) +#flog gets used by criu, build it earlier + +flogMakefile: ; +flog%: + $(Q) $(MAKE) $(build)=flog $@ +flog: + $(Q) $(MAKE) $(build)=flog all +.PHONY: flog + # # CRIU building done in own directory # with slightly different rules so we @@ -275,6 +284,7 @@ lib: crit clean mrproper: $(Q) $(MAKE) $(build)=images $@ + $(Q) $(MAKE) $(build)=flog $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ @@ -284,15 +294,19 @@ clean mrproper: $(Q) $(MAKE) $(build)=crit $@ .PHONY: clean mrproper +clean-dummy_amdgpu_plugin: + $(Q) $(MAKE) -C 
plugins/amdgpu clean +.PHONY: clean dummy_amdgpu_plugin + clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top -clean: clean-top +clean: clean-top clean-dummy_amdgpu_plugin -mrproper-top: clean-top +mrproper-top: clean-top clean-dummy_amdgpu_plugin $(Q) $(RM) $(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) $(COMPEL_VERSION_HEADER) @@ -320,6 +334,10 @@ test: zdtm $(Q) $(MAKE) -C test .PHONY: test +dummy_amdgpu_plugin: + $(Q) $(MAKE) -C plugins/amdgpu all +.PHONY: dummy_amdgpu_plugin + # # Generating tar requires tag matched CRIU_VERSION. # If not found then simply use GIT's describe with @@ -409,11 +427,13 @@ lint: flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py flake8 --config=scripts/flake8.cfg scripts/criu-ns + flake8 --config=scripts/flake8.cfg coredump/ shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install shellcheck test/others/crit/*.sh shellcheck test/others/libcriu/*.sh + shellcheck test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck test/others/config-file/*.sh # Do not append \n to pr_perror or fail ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' @@ -429,7 +449,9 @@ lint: codecov: SHELL := $(shell which bash) codecov: - bash <(curl -s https://codecov.io/bash) + curl -Os https://uploader.codecov.io/latest/linux/codecov + chmod +x codecov + ./codecov .PHONY: codecov fetch-clang-format: .FORCE diff --git a/Makefile.install b/Makefile.install index 3987bcc6fb..52e8c06dad 100644 --- a/Makefile.install +++ b/Makefile.install @@ -7,6 +7,7 @@ MANDIR ?= $(PREFIX)/share/man INCLUDEDIR ?= $(PREFIX)/include LIBEXECDIR ?= $(PREFIX)/libexec RUNDIR ?= /run +PLUGINDIR ?= /var/lib/criu # # For recent Debian/Ubuntu with multiarch support. 
@@ -26,7 +27,7 @@ endif LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR -export LIBDIR INCLUDEDIR LIBEXECDIR +export LIBDIR INCLUDEDIR LIBEXECDIR PLUGINDIR install-man: $(Q) $(MAKE) -C Documentation install @@ -40,6 +41,10 @@ install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu +install-dummy_amdgpu_plugin: dummy_amdgpu_plugin + $(Q) $(MAKE) -C plugins/amdgpu install +.PHONY: install-dummy_amdgpu_plugin + install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install @@ -54,4 +59,5 @@ uninstall: $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ + $(Q) $(MAKE) -C plugins/amdgpu $@ .PHONY: uninstall diff --git a/README.md b/README.md index fd86b2c159..6b86cac9e1 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ ![X86_64 GCC Test](https://github.com/checkpoint-restore/criu/workflows/X86_64%20GCC%20Test/badge.svg) -![Podman Test](https://github.com/checkpoint-restore/criu/workflows/Podman%20Test/badge.svg) +![Docker Test](https://github.com/checkpoint-restore/criu/actions/workflows/docker-test.yml/badge.svg) +![Podman Test](https://github.com/checkpoint-restore/criu/actions/workflows/podman-test.yml/badge.svg) [![CircleCI](https://circleci.com/gh/checkpoint-restore/criu.svg?style=svg)](https://circleci.com/gh/checkpoint-restore/criu)

diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 7cfa637ebe..bd1ed0da35 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -19,7 +19,7 @@ unsigned __page_shift = 0; */ const char code_syscall[] = { 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */ - 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ + 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index 6715afdb3c..7700f52caf 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -18,7 +18,7 @@ */ const char code_syscall[] = { 0x00, 0x00, 0x00, 0xef, /* SVC #0 */ - 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ + 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); diff --git a/compel/arch/mips/src/lib/include/ldsodefs.h b/compel/arch/mips/src/lib/include/ldsodefs.h index 8cfde24962..97e79755d4 100644 --- a/compel/arch/mips/src/lib/include/ldsodefs.h +++ b/compel/arch/mips/src/lib/include/ldsodefs.h @@ -69,8 +69,8 @@ struct La_mips_64_retval; /* An entry in a 64 bit SHT_REL section. 
*/ typedef struct { - Elf32_Word r_sym; /* Symbol index */ - unsigned char r_ssym; /* Special symbol for 2nd relocation */ + Elf32_Word r_sym; /* Symbol index */ + unsigned char r_ssym; /* Special symbol for 2nd relocation */ unsigned char r_type3; /* 3rd relocation type */ unsigned char r_type2; /* 2nd relocation type */ unsigned char r_type1; /* 1st relocation type */ @@ -82,14 +82,14 @@ typedef union { } _Elf64_Mips_R_Info_union; typedef struct { - Elf64_Addr r_offset; /* Address */ + Elf64_Addr r_offset; /* Address */ _Elf64_Mips_R_Info_union r_info; /* Relocation type and symbol index */ } Elf64_Mips_Rel; typedef struct { - Elf64_Addr r_offset; /* Address */ + Elf64_Addr r_offset; /* Address */ _Elf64_Mips_R_Info_union r_info; /* Relocation type and symbol index */ - Elf64_Sxword r_addend; /* Addend */ + Elf64_Sxword r_addend; /* Addend */ } Elf64_Mips_Rela; #define ELF64_MIPS_R_SYM(i) ((__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_sym) diff --git a/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h b/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h index 82ae6096b7..6db1ddbd30 100644 --- a/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h +++ b/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h @@ -52,14 +52,14 @@ typedef struct siginfo { /* kill() */ struct { - __kernel_pid_t _pid; /* sender's pid */ + __kernel_pid_t _pid; /* sender's pid */ __ARCH_SI_UID_T _uid; /* sender's uid */ } _kill; /* POSIX.1b timers */ struct { __kernel_timer_t _tid; /* timer id */ - int _overrun; /* overrun count */ + int _overrun; /* overrun count */ char _pad[sizeof(__ARCH_SI_UID_T) - sizeof(int)]; sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ @@ -67,16 +67,16 @@ typedef struct siginfo { /* POSIX.1b signals */ struct { - __kernel_pid_t _pid; /* sender's pid */ + __kernel_pid_t _pid; /* sender's pid */ __ARCH_SI_UID_T _uid; /* sender's uid */ sigval_t _sigval; } _rt; /* SIGCHLD */ struct { - __kernel_pid_t _pid; /* 
which child */ + __kernel_pid_t _pid; /* which child */ __ARCH_SI_UID_T _uid; /* sender's uid */ - int _status; /* exit code */ + int _status; /* exit code */ __ARCH_SI_CLOCK_T _utime; __ARCH_SI_CLOCK_T _stime; } _sigchld; @@ -104,8 +104,8 @@ typedef struct siginfo { /* SIGSYS */ struct { - void *_call_addr; /* calling user insn */ - int _syscall; /* triggering system call number */ + void *_call_addr; /* calling user insn */ + int _syscall; /* triggering system call number */ unsigned int _arch; /* AUDIT_ARCH_* of syscall */ } _sigsys; } _sifields; diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index 68d0a27285..afa0f5ed5f 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -24,7 +24,7 @@ */ const char code_syscall[] = { 0x0c, 0x00, 0x00, 0x00, /* syscall */ - 0x0d, 0x00, 0x00, 0x00 /* break */ + 0x0d, 0x00, 0x00, 0x00 /* break */ }; /* 10-byte legacy floating point register */ diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h index fe6192e207..8cf8a135fb 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h @@ -21,13 +21,13 @@ typedef struct { unsigned long xer; unsigned long ccr; unsigned long softe; /* Soft enabled/disabled */ - unsigned long trap; /* Reason for being here */ + unsigned long trap; /* Reason for being here */ /* * N.B. for critical exceptions on 4xx, the dar and dsisr * fields are overloaded to hold srr0 and srr1. 
*/ - unsigned long dar; /* Fault registers */ - unsigned long dsisr; /* on 4xx/Book-E used for ESR */ + unsigned long dar; /* Fault registers */ + unsigned long dsisr; /* on 4xx/Book-E used for ESR */ unsigned long result; /* Result of a system call */ } user_regs_struct_t; diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index fc174d0dd2..61cd6e9857 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -30,7 +30,7 @@ unsigned __page_shift = 0; */ const uint32_t code_syscall[] = { 0x44000002, /* sc */ - 0x0fe00000 /* twi 31,0,0 */ + 0x0fe00000 /* twi 31,0,0 */ }; static inline __always_unused void __check_code_syscall(void) diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 77ace713a6..3cd25e71d8 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -624,8 +624,8 @@ enum kernel_ts_level { }; /* See arch/s390/include/asm/processor.h */ -#define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ -#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ +#define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ +#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ #define TASK_SIZE_LEVEL_5 0xffffffffffffefffUL /* 16 EB - 0x1000 */ /* diff --git a/compel/arch/x86/plugins/std/syscalls/syscall32.c b/compel/arch/x86/plugins/std/syscalls/syscall32.c index 0f2fec3ff8..d09fd38c71 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall32.c +++ b/compel/arch/x86/plugins/std/syscalls/syscall32.c @@ -1,9 +1,9 @@ #include "asm/types.h" #include "syscall-32.h" -#define SYS_SOCKET 1 /* sys_socket(2) */ -#define SYS_BIND 2 /* sys_bind(2) */ -#define SYS_CONNECT 3 /* sys_connect(2) */ +#define SYS_SOCKET 1 /* sys_socket(2) */ +#define SYS_BIND 2 /* sys_bind(2) */ +#define SYS_CONNECT 3 /* sys_connect(2) */ #define SYS_SENDTO 11 /* sys_sendto(2) */ #define SYS_RECVFROM 12 /* sys_recvfrom(2) */ #define SYS_SHUTDOWN 13 /* sys_shutdown(2) */ diff 
--git a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h index 8d54516af4..63ff83dbeb 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h @@ -43,16 +43,16 @@ enum cpuid_leafs { #define NCAPINTS_BITS (NCAPINTS * 32) /* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ -#define X86_FEATURE_FPU (0 * 32 + 0) /* Onboard FPU */ -#define X86_FEATURE_VME (0 * 32 + 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE (0 * 32 + 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE (0 * 32 + 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC (0 * 32 + 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR (0 * 32 + 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE (0 * 32 + 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE (0 * 32 + 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 (0 * 32 + 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC (0 * 32 + 9) /* Onboard APIC */ +#define X86_FEATURE_FPU (0 * 32 + 0) /* Onboard FPU */ +#define X86_FEATURE_VME (0 * 32 + 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE (0 * 32 + 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE (0 * 32 + 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC (0 * 32 + 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR (0 * 32 + 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE (0 * 32 + 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE (0 * 32 + 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 (0 * 32 + 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC (0 * 32 + 9) /* Onboard APIC */ #define X86_FEATURE_SEP (0 * 32 + 11) /* SYSENTER/SYSEXIT */ #define X86_FEATURE_MTRR (0 * 32 + 12) /* Memory Type Range Registers */ #define X86_FEATURE_PGE (0 * 32 + 13) /* Page Global Enable */ @@ -100,12 +100,12 @@ enum cpuid_leafs { #define X86_FEATURE_CENTAUR_MCR (3 * 32 + 3) /* Centaur MCRs (= MTRRs) 
*/ /* CPU types for specific tunings: */ -#define X86_FEATURE_K8 (3 * 32 + 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 (3 * 32 + 5) /* "" Athlon */ -#define X86_FEATURE_P3 (3 * 32 + 6) /* "" P3 */ -#define X86_FEATURE_P4 (3 * 32 + 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC (3 * 32 + 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP (3 * 32 + 9) /* SMP kernel running on UP */ +#define X86_FEATURE_K8 (3 * 32 + 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 (3 * 32 + 5) /* "" Athlon */ +#define X86_FEATURE_P3 (3 * 32 + 6) /* "" P3 */ +#define X86_FEATURE_P4 (3 * 32 + 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC (3 * 32 + 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP (3 * 32 + 9) /* SMP kernel running on UP */ #define X86_FEATURE_ART (3 * 32 + 10) /* Always running timer (ART) */ #define X86_FEATURE_ARCH_PERFMON (3 * 32 + 11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS (3 * 32 + 12) /* Precise-Event Based Sampling */ @@ -129,16 +129,16 @@ enum cpuid_leafs { #define X86_FEATURE_TSC_KNOWN_FREQ (3 * 32 + 31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ -#define X86_FEATURE_XMM3 (4 * 32 + 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ (4 * 32 + 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 (4 * 32 + 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT (4 * 32 + 3) /* "monitor" MONITOR/MWAIT support */ -#define X86_FEATURE_DSCPL (4 * 32 + 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ -#define X86_FEATURE_VMX (4 * 32 + 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX (4 * 32 + 6) /* Safer Mode eXtensions */ -#define X86_FEATURE_EST (4 * 32 + 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 (4 * 32 + 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 (4 * 32 + 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_XMM3 (4 * 32 + 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ (4 * 32 + 1) /* PCLMULQDQ 
instruction */ +#define X86_FEATURE_DTES64 (4 * 32 + 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT (4 * 32 + 3) /* "monitor" MONITOR/MWAIT support */ +#define X86_FEATURE_DSCPL (4 * 32 + 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ +#define X86_FEATURE_VMX (4 * 32 + 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX (4 * 32 + 6) /* Safer Mode eXtensions */ +#define X86_FEATURE_EST (4 * 32 + 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 (4 * 32 + 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 (4 * 32 + 9) /* Supplemental SSE-3 */ #define X86_FEATURE_CID (4 * 32 + 10) /* Context ID */ #define X86_FEATURE_SDBG (4 * 32 + 11) /* Silicon Debug */ #define X86_FEATURE_FMA (4 * 32 + 12) /* Fused multiply-add */ @@ -162,28 +162,28 @@ enum cpuid_leafs { #define X86_FEATURE_HYPERVISOR (4 * 32 + 31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE (5 * 32 + 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN (5 * 32 + 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT (5 * 32 + 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN (5 * 32 + 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 (5 * 32 + 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN (5 * 32 + 9) /* ACE v2 enabled */ +#define X86_FEATURE_XSTORE (5 * 32 + 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN (5 * 32 + 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT (5 * 32 + 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN (5 * 32 + 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 (5 * 32 + 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN (5 * 32 + 9) /* ACE v2 enabled */ #define X86_FEATURE_PHE (5 * 32 + 10) /* PadLock Hash Engine */ #define X86_FEATURE_PHE_EN (5 * 32 + 11) /* PHE enabled */ #define X86_FEATURE_PMM (5 * 32 + 12) /* 
PadLock Montgomery Multiplier */ #define X86_FEATURE_PMM_EN (5 * 32 + 13) /* PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ -#define X86_FEATURE_LAHF_LM (6 * 32 + 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY (6 * 32 + 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM (6 * 32 + 2) /* Secure Virtual Machine */ -#define X86_FEATURE_EXTAPIC (6 * 32 + 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY (6 * 32 + 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM (6 * 32 + 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A (6 * 32 + 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE (6 * 32 + 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH (6 * 32 + 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW (6 * 32 + 9) /* OS Visible Workaround */ +#define X86_FEATURE_LAHF_LM (6 * 32 + 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY (6 * 32 + 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM (6 * 32 + 2) /* Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC (6 * 32 + 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY (6 * 32 + 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM (6 * 32 + 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A (6 * 32 + 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE (6 * 32 + 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH (6 * 32 + 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW (6 * 32 + 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6 * 32 + 10) /* Instruction Based Sampling */ #define X86_FEATURE_XOP (6 * 32 + 11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6 * 32 + 12) /* SKINIT/STGI instructions */ @@ -202,14 +202,14 @@ enum cpuid_leafs { #define X86_FEATURE_MWAITX (6 * 32 + 29) /* MWAIT extension (MONITORX/MWAITX instructions) */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -#define 
X86_FEATURE_FSGSBASE (9 * 32 + 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -#define X86_FEATURE_TSC_ADJUST (9 * 32 + 1) /* TSC adjustment MSR 0x3B */ -#define X86_FEATURE_BMI1 (9 * 32 + 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE (9 * 32 + 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 (9 * 32 + 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP (9 * 32 + 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 (9 * 32 + 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS (9 * 32 + 9) /* Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_FSGSBASE (9 * 32 + 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST (9 * 32 + 1) /* TSC adjustment MSR 0x3B */ +#define X86_FEATURE_BMI1 (9 * 32 + 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE (9 * 32 + 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 (9 * 32 + 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP (9 * 32 + 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 (9 * 32 + 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS (9 * 32 + 9) /* Enhanced REP MOVSB/STOSB instructions */ #define X86_FEATURE_INVPCID (9 * 32 + 10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM (9 * 32 + 11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM (9 * 32 + 12) /* Cache QoS Monitoring */ @@ -238,14 +238,14 @@ enum cpuid_leafs { #define X86_FEATURE_XSAVES (10 * 32 + 3) /* XSAVES/XRSTORS instructions */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 11 */ -#define X86_FEATURE_PREFETCHWT1 (11 * 32 + 0) /* PREFETCHWT1 IntelĀ® Xeon PhiTM only */ -#define X86_FEATURE_AVX512VBMI (11 * 32 + 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_UMIP (11 * 32 + 2) /* User Mode Instruction Protection */ -#define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for 
Userspace */ -#define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -#define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ +#define X86_FEATURE_PREFETCHWT1 (11 * 32 + 0) /* PREFETCHWT1 IntelĀ® Xeon PhiTM only */ +#define X86_FEATURE_AVX512VBMI (11 * 32 + 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (11 * 32 + 2) /* User Mode Instruction Protection */ +#define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (11 * 32 + 10) /* Carry-Less Multiplication Double Quadword */ #define X86_FEATURE_AVX512_VNNI (11 * 32 + 11) /* Vector Neural Network Instructions */ #define X86_FEATURE_AVX512_BITALG (11 * 32 + 12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ @@ -261,35 +261,35 @@ enum cpuid_leafs { #define X86_FEATURE_CQM_MBM_LOCAL (12 * 32 + 2) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -#define X86_FEATURE_CLZERO (13 * 32 + 0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13 * 32 + 1) /* Instructions Retired Count */ -#define X86_FEATURE_XSAVEERPTR (13 * 32 + 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_CLZERO (13 * 32 + 0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13 * 32 + 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13 * 32 + 2) /* Always save/restore FP error pointers */ #define X86_FEATURE_IBPB (13 * 32 + 12) /* Indirect Branch Prediction 
Barrier */ #define X86_FEATURE_IBRS (13 * 32 + 14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_STIBP (13 * 32 + 15) /* Single Thread Indirect Branch Predictors */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ -#define X86_FEATURE_DTHERM (14 * 32 + 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14 * 32 + 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14 * 32 + 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14 * 32 + 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14 * 32 + 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14 * 32 + 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14 * 32 + 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14 * 32 + 9) /* HWP Activity Window */ +#define X86_FEATURE_DTHERM (14 * 32 + 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14 * 32 + 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14 * 32 + 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14 * 32 + 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14 * 32 + 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14 * 32 + 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14 * 32 + 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14 * 32 + 9) /* HWP Activity Window */ #define X86_FEATURE_HWP_EPP (14 * 32 + 10) /* HWP Energy Perf. 
Preference */ #define X86_FEATURE_HWP_PKG_REQ (14 * 32 + 11) /* HWP Package Level Request */ #define X86_FEATURE_HDC (14 * 32 + 13) /* HDC base registers present */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ -#define X86_FEATURE_NPT (15 * 32 + 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15 * 32 + 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15 * 32 + 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15 * 32 + 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15 * 32 + 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15 * 32 + 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15 * 32 + 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15 * 32 + 7) /* Decode Assists support */ +#define X86_FEATURE_NPT (15 * 32 + 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15 * 32 + 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15 * 32 + 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15 * 32 + 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15 * 32 + 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15 * 32 + 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15 * 32 + 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15 * 32 + 7) /* Decode Assists support */ #define X86_FEATURE_PAUSEFILTER (15 * 32 + 10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15 * 32 + 12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15 * 32 + 13) /* Virtual Interrupt Controller */ @@ -305,8 +305,8 @@ enum cpuid_leafs { #define X86_FEATURE_SMCA (17 * 32 + 3) /* Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -#define X86_FEATURE_AVX512_4VNNIW (18 * 32 + 2) /* AVX-512 Neural Network Instructions */ -#define 
X86_FEATURE_AVX512_4FMAPS (18 * 32 + 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_AVX512_4VNNIW (18 * 32 + 2) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18 * 32 + 3) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_PCONFIG (18 * 32 + 18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18 * 32 + 26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18 * 32 + 27) /* "" Single Thread Indirect Branch Predictors */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index d740e3c04a..c8ebda0970 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -80,6 +80,11 @@ enum xfeature { (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | XFEATURE_MASK_PKRU | XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) +/* xsave structure features which is safe to fill with garbage (see validate_random_xstate()) */ +#define XFEATURE_MASK_FAULTINJ \ + (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM) + struct fpx_sw_bytes { uint32_t magic1; uint32_t extended_size; @@ -105,7 +110,7 @@ struct i387_fxsave_struct { uint32_t fos; /* FPU Operand Selector */ }; }; - uint32_t mxcsr; /* MXCSR Register State */ + uint32_t mxcsr; /* MXCSR Register State */ uint32_t mxcsr_mask; /* MXCSR Mask */ /* 8*16 bytes for each FP-reg = 128 bytes */ @@ -277,13 +282,13 @@ typedef struct { } fpu_state_64_t; struct user_i387_ia32_struct { - uint32_t cwd; /* FPU Control Word */ - uint32_t swd; /* FPU Status Word */ - uint32_t twd; /* FPU Tag Word */ - uint32_t fip; /* FPU IP Offset */ - uint32_t fcs; /* FPU IP Selector */ - uint32_t foo; /* FPU Operand Pointer Offset */ - uint32_t fos; /* FPU Operand Pointer Selector 
*/ + uint32_t cwd; /* FPU Control Word */ + uint32_t swd; /* FPU Status Word */ + uint32_t twd; /* FPU Tag Word */ + uint32_t fip; /* FPU IP Offset */ + uint32_t fcs; /* FPU IP Selector */ + uint32_t foo; /* FPU Operand Pointer Offset */ + uint32_t fos; /* FPU Operand Pointer Selector */ uint32_t st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ }; diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 1e344bf3af..de9013c275 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -34,12 +34,12 @@ * Injected syscall instruction */ const char code_syscall[] = { - 0x0f, 0x05, /* syscall */ + 0x0f, 0x05, /* syscall */ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ }; const char code_int_80[] = { - 0xcd, 0x80, /* int $0x80 */ + 0xcd, 0x80, /* int $0x80 */ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ }; @@ -254,6 +254,7 @@ static void validate_random_xstate(struct xsave_struct *xsave) /* No unknown or supervisor features may be set */ hdr->xstate_bv &= XFEATURE_MASK_USER; hdr->xstate_bv &= ~XFEATURE_MASK_SUPERVISOR; + hdr->xstate_bv &= XFEATURE_MASK_FAULTINJ; for (i = 0; i < XFEATURE_MAX; i++) { if (!compel_fpu_has_feature(i)) @@ -282,10 +283,10 @@ static int corrupt_extregs(pid_t pid) bool use_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); user_fpregs_struct_t ext_regs; int *rand_to = (int *)&ext_regs; - unsigned int seed; + unsigned int seed, init_seed; size_t i; - seed = time(NULL); + init_seed = seed = time(NULL); for (i = 0; i < sizeof(ext_regs) / sizeof(int); i++) *rand_to++ = rand_r(&seed); @@ -295,7 +296,7 @@ static int corrupt_extregs(pid_t pid) * - zdtm.py will grep it auto-magically from logs * (and the seed will be known from an automatical testing) */ - pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? "xsave" : "fpuregs", pid, seed); + pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? 
"xsave" : "fpuregs", pid, init_seed); if (!use_xsave) { if (ptrace(PTRACE_SETFPREGS, pid, NULL, &ext_regs)) { diff --git a/compel/include/infect-priv.h b/compel/include/infect-priv.h index 1c03f44861..9d34428393 100644 --- a/compel/include/infect-priv.h +++ b/compel/include/infect-priv.h @@ -38,7 +38,7 @@ struct parasite_ctl { unsigned long parasite_ip; /* service routine start ip */ unsigned int *cmd; /* address for command */ - void *args; /* address for arguments */ + void *args; /* address for arguments */ unsigned long args_size; int tsock; /* transport socket for transferring fds */ diff --git a/compel/include/rpc-pie-priv.h b/compel/include/rpc-pie-priv.h index 2a239c6134..5a6b337b22 100644 --- a/compel/include/rpc-pie-priv.h +++ b/compel/include/rpc-pie-priv.h @@ -3,7 +3,7 @@ struct ctl_msg { uint32_t cmd; /* command itself */ uint32_t ack; /* ack on command */ - int32_t err; /* error code on reply */ + int32_t err; /* error code on reply */ }; #define ctl_msg_cmd(_cmd) \ diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index c3d2ee6a69..7fa0bd8a0d 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -106,7 +106,7 @@ struct infect_ctx { unsigned long task_size; unsigned long syscall_ip; /* entry point of infection */ - unsigned long flags; /* fine-tune (e.g. faults) */ + unsigned long flags; /* fine-tune (e.g. 
faults) */ void (*child_handler)(int, siginfo_t *, void *); /* hander for SIGCHLD deaths */ struct sigaction orig_handler; diff --git a/compel/include/uapi/loglevels.h b/compel/include/uapi/loglevels.h index e76c156578..7a49825d24 100644 --- a/compel/include/uapi/loglevels.h +++ b/compel/include/uapi/loglevels.h @@ -7,10 +7,10 @@ */ enum __compel_log_levels { - COMPEL_LOG_MSG, /* Print message regardless of log level */ + COMPEL_LOG_MSG, /* Print message regardless of log level */ COMPEL_LOG_ERROR, /* Errors only, when we're in trouble */ - COMPEL_LOG_WARN, /* Warnings */ - COMPEL_LOG_INFO, /* Informative, everything is fine */ + COMPEL_LOG_WARN, /* Warnings */ + COMPEL_LOG_INFO, /* Informative, everything is fine */ COMPEL_LOG_DEBUG, /* Debug only */ COMPEL_DEFAULT_LOGLEVEL = COMPEL_LOG_WARN diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h index c5291d20d3..533e0569ff 100644 --- a/compel/include/uapi/ptrace.h +++ b/compel/include/uapi/ptrace.h @@ -62,7 +62,7 @@ */ typedef struct { uint64_t filter_off; /* Input: which filter */ - uint64_t flags; /* Output: filter's flags */ + uint64_t flags; /* Output: filter's flags */ } seccomp_metadata_t; #ifdef PTRACE_EVENT_STOP diff --git a/compel/plugins/std/string.c b/compel/plugins/std/string.c index bde1bc68b9..d67e0d1a9d 100644 --- a/compel/plugins/std/string.c +++ b/compel/plugins/std/string.c @@ -151,7 +151,12 @@ static unsigned int __conv_val(unsigned char c) if (__isdigit(c)) return c - '0'; else if (__isalpha(c)) - return &conv_tab[__tolower(c)] - conv_tab; + /** + * If we want the value of something which __isalpha() == true + * it has to be base > 10. 'A' = 10, 'B' = 11 ... 
'Z' = 35 + */ + return __tolower(c) - 'a' + 10; + return -1u; } diff --git a/compel/src/lib/handle-elf.c b/compel/src/lib/handle-elf.c index 9662751e0f..22c8f29786 100644 --- a/compel/src/lib/handle-elf.c +++ b/compel/src/lib/handle-elf.c @@ -554,7 +554,7 @@ int __handle_elf(void *mem, size_t size) #endif /* ELF_PPC64 */ #ifdef ELF_X86_64 - case R_X86_64_32: /* Symbol + Addend (4 bytes) */ + case R_X86_64_32: /* Symbol + Addend (4 bytes) */ case R_X86_64_32S: /* Symbol + Addend (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_32 at 0x%-4lx val 0x%x\n", place, value32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " diff --git a/compel/src/main.c b/compel/src/main.c index a9a50959f9..f461ff04d1 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -19,6 +19,7 @@ #define CFLAGS_DEFAULT_SET \ "-Wstrict-prototypes " \ + "-ffreestanding " \ "-fno-stack-protector -nostdlib -fomit-frame-pointer " #define COMPEL_CFLAGS_PIE CFLAGS_DEFAULT_SET "-fpie" diff --git a/coredump/coredump-python2 b/coredump/coredump-python2 new file mode 100755 index 0000000000..564c05ce9f --- /dev/null +++ b/coredump/coredump-python2 @@ -0,0 +1,6 @@ +#!/usr/bin/env python2 + +import coredump + +if __name__ == '__main__': + coredump.main() diff --git a/coredump/coredump-python3 b/coredump/coredump-python3 new file mode 100755 index 0000000000..3032dbadf1 --- /dev/null +++ b/coredump/coredump-python3 @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +import coredump + +if __name__ == '__main__': + coredump.main() diff --git a/coredump/coredump.py b/coredump/coredump.py new file mode 100644 index 0000000000..5e63d21385 --- /dev/null +++ b/coredump/coredump.py @@ -0,0 +1,41 @@ +import argparse +import os + +import criu_coredump + + +def coredump(opts): + generator = criu_coredump.coredump_generator() + cores = generator(os.path.realpath(opts['in'])) + for pid in cores: + if opts['pid'] and pid != opts['pid']: + continue + with open(os.path.realpath(opts['out']) + "/core." 
+ str(pid), 'wb+') as f: + cores[pid].write(f) + + +def main(): + desc = 'CRIU core dump' + parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument('-i', + '--in', + default='.', + help='directory where to get images from') + parser.add_argument('-p', + '--pid', + type=int, + help='generate coredump for specific pid(all pids py default)') + parser.add_argument('-o', + '--out', + default='.', + help='directory to write coredumps to') + + opts = vars(parser.parse_args()) + + coredump(opts) + + +if __name__ == '__main__': + main() diff --git a/coredump/criu-coredump b/coredump/criu-coredump deleted file mode 100755 index 25c188c6bc..0000000000 --- a/coredump/criu-coredump +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python2 -import argparse -import os - -import criu_coredump - -def coredump(opts): - generator = criu_coredump.coredump_generator() - cores = generator(os.path.realpath(opts['in'])) - for pid in cores: - if opts['pid'] and pid != opts['pid']: - continue - with open(os.path.realpath(opts['out'])+"/core."+str(pid), 'w+') as f: - cores[pid].write(f) - - -def main(): - desc = 'CRIU core dump' - parser = argparse.ArgumentParser(description=desc, - formatter_class=argparse.RawTextHelpFormatter) - - parser.add_argument('-i', - '--in', - default = '.', - help = 'directory where to get images from') - parser.add_argument('-p', - '--pid', - type = int, - help = 'generate coredump for specific pid(all pids py default)') - parser.add_argument('-o', - '--out', - default = '.', - help = 'directory to write coredumps to') - - opts = vars(parser.parse_args()) - - coredump(opts) - -if __name__ == '__main__': - main() diff --git a/coredump/criu_coredump/__init__.py b/coredump/criu_coredump/__init__.py index 213af42ec6..c1a437cf42 100644 --- a/coredump/criu_coredump/__init__.py +++ b/coredump/criu_coredump/__init__.py @@ -1,2 +1 @@ -from coredump import * -import elf +from .coredump import 
coredump_generator diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index b37ef22913..881c40b0a9 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -29,9 +29,17 @@ # 4) VMAs contents; # import io -import elf +import sys import ctypes + from pycriu import images +from . import elf + + +try: + from itertools import ifilter as filter +except ImportError: + pass # Some memory-related constants PAGESIZE = 4096 @@ -88,7 +96,7 @@ def write(self, f): for note in self.notes: buf.write(note.nhdr) buf.write(note.owner) - buf.write("\0" * (8 - len(note.owner))) + buf.write(b"\0" * (8 - len(note.owner))) buf.write(note.data) offset = ctypes.sizeof(elf.Elf64_Ehdr()) @@ -136,7 +144,7 @@ def _img_open_and_strip(self, name, single=False, pid=None): path += "-" + str(pid) path += ".img" - with open(path) as f: + with open(path, 'rb') as f: img = images.load(f) if single: @@ -177,7 +185,7 @@ def write(self, coredumps_dir, pid=None): for p in self.coredumps: if pid and p != pid: continue - with open(coredumps_dir + "/" + "core." + str(p), 'w+') as f: + with open(coredumps_dir + "/" + "core." + str(p), 'wb+') as f: self.coredumps[p].write(f) def _gen_coredump(self, pid): @@ -215,7 +223,7 @@ def _gen_ehdr(self, pid, phdrs): ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) - #FIXME Case len(phdrs) > PN_XNUM should be handled properly. + # FIXME Case len(phdrs) > PN_XNUM should be handled properly. # See fs/binfmt_elf.c from linux kernel. ehdr.e_phnum = len(phdrs) @@ -295,7 +303,7 @@ def _gen_prpsinfo(self, pid): prpsinfo.pr_state = 3 # Don't even ask me why it is so, just borrowed from linux # source and made pr_state match. - prpsinfo.pr_sname = '.' if prpsinfo.pr_state > 5 else "RSDTZW" [ + prpsinfo.pr_sname = b'.' 
if prpsinfo.pr_state > 5 else b"RSDTZW" [ prpsinfo.pr_state] prpsinfo.pr_zomb = 1 if prpsinfo.pr_state == 4 else 0 prpsinfo.pr_nice = core["thread_core"][ @@ -307,8 +315,11 @@ def _gen_prpsinfo(self, pid): prpsinfo.pr_ppid = pstree["ppid"] prpsinfo.pr_pgrp = pstree["pgid"] prpsinfo.pr_sid = pstree["sid"] - prpsinfo.pr_fname = core["tc"]["comm"] prpsinfo.pr_psargs = self._gen_cmdline(pid) + if (sys.version_info > (3, 0)): + prpsinfo.pr_fname = core["tc"]["comm"].encode() + else: + prpsinfo.pr_fname = core["tc"]["comm"] nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -317,7 +328,7 @@ def _gen_prpsinfo(self, pid): note = elf_note() note.data = prpsinfo - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note @@ -334,7 +345,7 @@ def _gen_prstatus(self, pid, tid): ctypes.memset(ctypes.addressof(prstatus), 0, ctypes.sizeof(prstatus)) - #FIXME setting only some of the fields for now. Revisit later. + # FIXME setting only some of the fields for now. Revisit later. prstatus.pr_pid = tid prstatus.pr_ppid = pstree["ppid"] prstatus.pr_pgrp = pstree["pgid"] @@ -375,7 +386,7 @@ def _gen_prstatus(self, pid, tid): note = elf_note() note.data = prstatus - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note @@ -402,7 +413,6 @@ def _gen_fpregset(self, pid, tid): *regs["st_space"]) fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( *regs["xmm_space"]) - #fpregset.padding = regs["padding"] unused nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -411,7 +421,7 @@ def _gen_fpregset(self, pid, tid): note = elf_note() note.data = fpregset - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note @@ -452,7 +462,7 @@ def _gen_x86_xstate(self, pid, tid): note = elf_note() note.data = data - note.owner = "LINUX" + note.owner = b"LINUX" note.nhdr = nhdr return note @@ -472,7 +482,7 @@ def _gen_siginfo(self, pid, tid): note = elf_note() note.data = siginfo - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note @@ -482,7 
+492,7 @@ def _gen_auxv(self, pid): Generate NT_AUXV note for thread tid of process pid. """ mm = self.mms[pid] - num_auxv = len(mm["mm_saved_auxv"]) / 2 + num_auxv = len(mm["mm_saved_auxv"]) // 2 class elf_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t * num_auxv)] @@ -499,7 +509,7 @@ class elf_auxv(ctypes.Structure): note = elf_note() note.data = auxv - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note @@ -523,10 +533,10 @@ class mmaped_file_info: continue shmid = vma["shmid"] - off = vma["pgoff"] / PAGESIZE + off = vma["pgoff"] // PAGESIZE files = self.reg_files - fname = filter(lambda x: x["id"] == shmid, files)[0]["name"] + fname = next(filter(lambda x: x["id"] == shmid, files))["name"] info = mmaped_file_info() info.start = vma["start"] @@ -569,17 +579,20 @@ class elf_files(ctypes.Structure): setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) - setattr(data, "name" + str(i), info.name) + if (sys.version_info > (3, 0)): + setattr(data, "name" + str(i), info.name.encode()) + else: + setattr(data, "name" + str(i), info.name) nhdr = elf.Elf64_Nhdr() - nhdr.n_namesz = 5 #XXX strlen + 1 + nhdr.n_namesz = 5 # strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) nhdr.n_type = elf.NT_FILE note = elf_note() note.nhdr = nhdr - note.owner = "CORE" + note.owner = b"CORE" note.data = data return note @@ -640,11 +653,11 @@ def _get_page(self, pid, page_no): if not found: continue - if "in_parent" in m and m["in_parent"] == True: + if "in_parent" in m and m["in_parent"]: ppid = self.pstree[pid]["ppid"] return self._get_page(ppid, page_no) else: - with open(self._imgs_dir + "/pages-%s.img" % pages_id) as f: + with open(self._imgs_dir + "/pages-%s.img" % pages_id, 'rb') as f: f.seek(off * PAGESIZE) return f.read(PAGESIZE) @@ -657,16 +670,16 @@ def _gen_mem_chunk(self, pid, vma, size): f = None if size == 0: - return "" + return b"" if vma["status"] & 
status["VMA_AREA_VVAR"]: - #FIXME this is what gdb does, as vvar vma + # FIXME this is what gdb does, as vvar vma # is not readable from userspace? - return "\0" * size + return b"\0" * size elif vma["status"] & status["VMA_AREA_VSYSCALL"]: - #FIXME need to dump it with criu or read from + # FIXME need to dump it with criu or read from # current process. - return "\0" * size + return b"\0" * size if vma["status"] & status["VMA_FILE_SHARED"] or \ vma["status"] & status["VMA_FILE_PRIVATE"]: @@ -675,9 +688,9 @@ def _gen_mem_chunk(self, pid, vma, size): off = vma["pgoff"] files = self.reg_files - fname = filter(lambda x: x["id"] == shmid, files)[0]["name"] + fname = next(filter(lambda x: x["id"] == shmid, files))["name"] - f = open(fname) + f = open(fname, 'rb') f.seek(off) start = vma["start"] @@ -699,10 +712,10 @@ def _gen_mem_chunk(self, pid, vma, size): # a file, and changed ones -- from pages.img. # Finally, if no page is found neither in pages.img nor # in file, hole in inserted -- a page filled with zeroes. - start_page = start / PAGESIZE - end_page = end / PAGESIZE + start_page = start // PAGESIZE + end_page = end // PAGESIZE - buf = "" + buf = b"" for page_no in range(start_page, end_page + 1): page = None @@ -710,17 +723,17 @@ def _gen_mem_chunk(self, pid, vma, size): # and choose appropriate. page_mem = self._get_page(pid, page_no) - if f != None: + if f is not None: page = f.read(PAGESIZE) - if page_mem != None: + if page_mem is not None: # Page from pages.img has higher priority # than one from maped file on disk. page = page_mem if page is None: # Hole - page = PAGESIZE * "\0" + page = PAGESIZE * b"\0" # If it is a start or end page, we need to read # only part of it. @@ -740,7 +753,7 @@ def _gen_mem_chunk(self, pid, vma, size): buf += page[n_skip:n_skip + n_read] # Don't forget to close file. 
- if f != None: + if f is not None: f.close() return buf @@ -762,25 +775,25 @@ def _gen_cmdline(self, pid): chunk = self._gen_mem_chunk(pid, vma, size) # Replace all '\0's with spaces. - return chunk.replace('\0', ' ') + return chunk.replace(b'\0', b' ') def _get_vma_dump_size(self, vma): """ Calculate amount of vma to put into core dump. """ - if vma["status"] & status["VMA_AREA_VVAR"] or \ - vma["status"] & status["VMA_AREA_VSYSCALL"] or \ - vma["status"] & status["VMA_AREA_VDSO"]: + if (vma["status"] & status["VMA_AREA_VVAR"] or + vma["status"] & status["VMA_AREA_VSYSCALL"] or + vma["status"] & status["VMA_AREA_VDSO"]): size = vma["end"] - vma["start"] elif vma["prot"] == 0: size = 0 - elif vma["prot"] & prot["PROT_READ"] and \ - vma["prot"] & prot["PROT_EXEC"]: + elif (vma["prot"] & prot["PROT_READ"] and + vma["prot"] & prot["PROT_EXEC"]): size = PAGESIZE - elif vma["status"] & status["VMA_ANON_SHARED"] or \ - vma["status"] & status["VMA_FILE_SHARED"] or \ - vma["status"] & status["VMA_ANON_PRIVATE"] or \ - vma["status"] & status["VMA_FILE_PRIVATE"]: + elif (vma["status"] & status["VMA_ANON_SHARED"] or + vma["status"] & status["VMA_FILE_SHARED"] or + vma["status"] & status["VMA_ANON_PRIVATE"] or + vma["status"] & status["VMA_FILE_PRIVATE"]): size = vma["end"] - vma["start"] else: size = 0 @@ -819,8 +832,6 @@ class vma_class: vmas = [] for vma in mm["vmas"]: - size = self._get_vma_dump_size(vma) - v = vma_class() v.filesz = self._get_vma_dump_size(vma) v.data = self._gen_mem_chunk(pid, vma, v.filesz) diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index e65919e6b8..092b478575 100644 --- a/coredump/criu_coredump/elf.py +++ b/coredump/criu_coredump/elf.py @@ -16,16 +16,13 @@ ELFMAG0 = 0x7f # #define ELFMAG0 0x7f /* Magic number byte 0 */ EI_MAG1 = 1 # #define EI_MAG1 1 /* File identification byte 1 index */ -ELFMAG1 = ord( - 'E') # #define ELFMAG1 'E' /* Magic number byte 1 */ +ELFMAG1 = ord('E') # #define ELFMAG1 'E' /* Magic number 
byte 1 */ EI_MAG2 = 2 # #define EI_MAG2 2 /* File identification byte 2 index */ -ELFMAG2 = ord( - 'L') # #define ELFMAG2 'L' /* Magic number byte 2 */ +ELFMAG2 = ord('L') # #define ELFMAG2 'L' /* Magic number byte 2 */ EI_MAG3 = 3 # #define EI_MAG3 3 /* File identification byte 3 index */ -ELFMAG3 = ord( - 'F') # #define ELFMAG3 'F' /* Magic number byte 3 */ +ELFMAG3 = ord('F') # #define ELFMAG3 'F' /* Magic number byte 3 */ EI_CLASS = 4 # #define EI_CLASS 4 /* File class byte index */ @@ -48,22 +45,22 @@ class Elf64_Ehdr(ctypes.Structure): # typedef struct - _fields_ = [ # { + _fields_ = [ ("e_ident", - ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; - ("e_type", Elf64_Half), # Elf64_Half e_type; - ("e_machine", Elf64_Half), # Elf64_Half e_machine; - ("e_version", Elf64_Word), # Elf64_Word e_version; - ("e_entry", Elf64_Addr), # Elf64_Addr e_entry; - ("e_phoff", Elf64_Off), # Elf64_Off e_phoff; - ("e_shoff", Elf64_Off), # Elf64_Off e_shoff; - ("e_flags", Elf64_Word), # Elf64_Word e_flags; - ("e_ehsize", Elf64_Half), # Elf64_Half e_ehsize; - ("e_phentsize", Elf64_Half), # Elf64_Half e_phentsize; - ("e_phnum", Elf64_Half), # Elf64_Half e_phnum; - ("e_shentsize", Elf64_Half), # Elf64_Half e_shentsize; - ("e_shnum", Elf64_Half), # Elf64_Half e_shnum; - ("e_shstrndx", Elf64_Half) # Elf64_Half e_shstrndx; + ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf64_Half), # Elf64_Half e_type; + ("e_machine", Elf64_Half), # Elf64_Half e_machine; + ("e_version", Elf64_Word), # Elf64_Word e_version; + ("e_entry", Elf64_Addr), # Elf64_Addr e_entry; + ("e_phoff", Elf64_Off), # Elf64_Off e_phoff; + ("e_shoff", Elf64_Off), # Elf64_Off e_shoff; + ("e_flags", Elf64_Word), # Elf64_Word e_flags; + ("e_ehsize", Elf64_Half), # Elf64_Half e_ehsize; + ("e_phentsize", Elf64_Half), # Elf64_Half e_phentsize; + ("e_phnum", Elf64_Half), # Elf64_Half e_phnum; + ("e_shentsize", Elf64_Half), # Elf64_Half e_shentsize; + ("e_shnum", Elf64_Half), # 
Elf64_Half e_shnum; + ("e_shstrndx", Elf64_Half) # Elf64_Half e_shstrndx; ] # } Elf64_Ehdr; @@ -80,15 +77,15 @@ class Elf64_Ehdr(ctypes.Structure): # typedef struct class Elf64_Phdr(ctypes.Structure): # typedef struct - _fields_ = [ # { - ("p_type", Elf64_Word), # Elf64_Word p_type; - ("p_flags", Elf64_Word), # Elf64_Word p_flags; - ("p_offset", Elf64_Off), # Elf64_Off p_offset; - ("p_vaddr", Elf64_Addr), # Elf64_Addr p_vaddr; - ("p_paddr", Elf64_Addr), # Elf64_Addr p_paddr; - ("p_filesz", Elf64_Xword), # Elf64_Xword p_filesz; - ("p_memsz", Elf64_Xword), # Elf64_Xword p_memsz; - ("p_align", Elf64_Xword), # Elf64_Xword p_align; + _fields_ = [ + ("p_type", Elf64_Word), # Elf64_Word p_type; + ("p_flags", Elf64_Word), # Elf64_Word p_flags; + ("p_offset", Elf64_Off), # Elf64_Off p_offset; + ("p_vaddr", Elf64_Addr), # Elf64_Addr p_vaddr; + ("p_paddr", Elf64_Addr), # Elf64_Addr p_paddr; + ("p_filesz", Elf64_Xword), # Elf64_Xword p_filesz; + ("p_memsz", Elf64_Xword), # Elf64_Xword p_memsz; + ("p_align", Elf64_Xword), # Elf64_Xword p_align; ] # } Elf64_Phdr; @@ -100,78 +97,89 @@ class _Elf64_auxv_t_U(ctypes.Union): class Elf64_auxv_t(ctypes.Structure): # typedef struct - _fields_ = [ # { + _fields_ = [ ("a_type", - ctypes.c_uint64), # uint64_t a_type; /* Entry type */ - ("a_un", _Elf64_auxv_t_U) # union - # { - # uint64_t a_val; /* Integer value */ - # /* We use to have pointer elements added here. We cannot do that, - # though, since it does not work when using 32-bit definitions - # on 64-bit platforms and vice versa. */ - # } a_un; + ctypes.c_uint64), # uint64_t a_type; /* Entry type */ + ("a_un", _Elf64_auxv_t_U) # union + + # uint64_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; ] # } Elf64_auxv_t; # Elf64_Nhdr related constants. 
-NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ -NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ -NT_PRPSINFO = 3 # #define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ -NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ -NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, -# size might increase */ -NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped -# files */ -NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ +NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ +NT_PRPSINFO = 3 # #define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ +NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ +NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ +NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ +NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ class Elf64_Nhdr(ctypes.Structure): # typedef struct - _fields_ = [ # { + _fields_ = [ ( "n_namesz", Elf64_Word - ), # Elf64_Word n_namesz; /* Length of the note's name. */ + ), # Elf64_Word n_namesz; /* Length of the note's name. */ ( "n_descsz", Elf64_Word - ), # Elf64_Word n_descsz; /* Length of the note's descriptor. */ + ), # Elf64_Word n_descsz; /* Length of the note's descriptor. */ ("n_type", Elf64_Word - ), # Elf64_Word n_type; /* Type of the note. */ + ), # Elf64_Word n_type; /* Type of the note. */ ] # } Elf64_Nhdr; # Elf64_Shdr related constants. 
-class Elf64_Shdr(ctypes.Structure): # typedef struct - _fields_ = [ # { +class Elf64_Shdr(ctypes.Structure): + _fields_ = [ ( + # Section name (string tbl index) "sh_name", Elf64_Word - ), # Elf64_Word sh_name; /* Section name (string tbl index) */ - ("sh_type", Elf64_Word - ), # Elf64_Word sh_type; /* Section type */ - ("sh_flags", Elf64_Xword - ), # Elf64_Xword sh_flags; /* Section flags */ + ), + ( + # Section type + "sh_type", Elf64_Word + ), + ( + # Section flags + "sh_flags", Elf64_Xword + ), ( + # Section virtual addr at execution "sh_addr", Elf64_Addr - ), # Elf64_Addr sh_addr; /* Section virtual addr at execution */ + ), ( + # Section file offset "sh_offset", Elf64_Off - ), # Elf64_Off sh_offset; /* Section file offset */ + ), ( + # Section size in bytes "sh_size", Elf64_Xword - ), # Elf64_Xword sh_size; /* Section size in bytes */ + ), ( + # Link to another section "sh_link", Elf64_Word - ), # Elf64_Word sh_link; /* Link to another section */ + ), ( + # Additional section information "sh_info", Elf64_Word - ), # Elf64_Word sh_info; /* Additional section information */ - ("sh_addralign", Elf64_Xword - ), # Elf64_Xword sh_addralign; /* Section alignment */ + ), ( + # Section alignment + "sh_addralign", Elf64_Xword + ), + ( + # Entry size if section holds table "sh_entsize", Elf64_Xword - ) # Elf64_Xword sh_entsize; /* Entry size if section holds table */ - ] # } Elf64_Shdr; + ) + ] # elf_prstatus related constants. @@ -179,188 +187,264 @@ class Elf64_Shdr(ctypes.Structure): # typedef struct # Signal info. class elf_siginfo(ctypes.Structure): # struct elf_siginfo - _fields_ = [ # { - ("si_signo", ctypes.c_int - ), # int si_signo; /* Signal number. */ - ("si_code", ctypes.c_int - ), # int si_code; /* Extra code. */ - ("si_errno", ctypes.c_int - ) # int si_errno; /* Errno. 
*/ - ] # }; + _fields_ = [ + ( + # Signal number + "si_signo", ctypes.c_int + ), + ( + # Extra code + "si_code", ctypes.c_int + ), + ( + # Errno + "si_errno", ctypes.c_int + ) + ] # A time value that is accurate to the nearest # microsecond but also has a range of years. class timeval(ctypes.Structure): # struct timeval - _fields_ = [ # { - ("tv_sec", - ctypes.c_long), # __time_t tv_sec; /* Seconds. */ - ("tv_usec", ctypes.c_long - ) # __suseconds_t tv_usec; /* Microseconds. */ - ] # }; + _fields_ = [ + ( + # __time_t tv_sec; /* Seconds. */ + "tv_sec", ctypes.c_long + ), + ( + # __suseconds_t tv_usec; /* Microseconds. */ + "tv_usec", ctypes.c_long + ) + ] class user_regs_struct(ctypes.Structure): # struct user_regs_struct - _fields_ = [ # { + _fields_ = [ ("r15", - ctypes.c_ulonglong), # __extension__ unsigned long long int r15; + ctypes.c_ulonglong), # __extension__ unsigned long long int r15; ("r14", - ctypes.c_ulonglong), # __extension__ unsigned long long int r14; + ctypes.c_ulonglong), # __extension__ unsigned long long int r14; ("r13", - ctypes.c_ulonglong), # __extension__ unsigned long long int r13; + ctypes.c_ulonglong), # __extension__ unsigned long long int r13; ("r12", - ctypes.c_ulonglong), # __extension__ unsigned long long int r12; + ctypes.c_ulonglong), # __extension__ unsigned long long int r12; ("rbp", - ctypes.c_ulonglong), # __extension__ unsigned long long int rbp; + ctypes.c_ulonglong), # __extension__ unsigned long long int rbp; ("rbx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rbx; + ctypes.c_ulonglong), # __extension__ unsigned long long int rbx; ("r11", - ctypes.c_ulonglong), # __extension__ unsigned long long int r11; + ctypes.c_ulonglong), # __extension__ unsigned long long int r11; ("r10", - ctypes.c_ulonglong), # __extension__ unsigned long long int r10; + ctypes.c_ulonglong), # __extension__ unsigned long long int r10; ("r9", - ctypes.c_ulonglong), # __extension__ unsigned long long int r9; + ctypes.c_ulonglong), # 
__extension__ unsigned long long int r9; ("r8", - ctypes.c_ulonglong), # __extension__ unsigned long long int r8; + ctypes.c_ulonglong), # __extension__ unsigned long long int r8; ("rax", - ctypes.c_ulonglong), # __extension__ unsigned long long int rax; + ctypes.c_ulonglong), # __extension__ unsigned long long int rax; ("rcx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rcx; + ctypes.c_ulonglong), # __extension__ unsigned long long int rcx; ("rdx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rdx; + ctypes.c_ulonglong), # __extension__ unsigned long long int rdx; ("rsi", - ctypes.c_ulonglong), # __extension__ unsigned long long int rsi; + ctypes.c_ulonglong), # __extension__ unsigned long long int rsi; ("rdi", - ctypes.c_ulonglong), # __extension__ unsigned long long int rdi; + ctypes.c_ulonglong), # __extension__ unsigned long long int rdi; ("orig_rax", ctypes.c_ulonglong - ), # __extension__ unsigned long long int orig_rax; + ), # __extension__ unsigned long long int orig_rax; ("rip", - ctypes.c_ulonglong), # __extension__ unsigned long long int rip; + ctypes.c_ulonglong), # __extension__ unsigned long long int rip; ("cs", - ctypes.c_ulonglong), # __extension__ unsigned long long int cs; + ctypes.c_ulonglong), # __extension__ unsigned long long int cs; ("eflags", - ctypes.c_ulonglong), # __extension__ unsigned long long int eflags; + ctypes.c_ulonglong), # __extension__ unsigned long long int eflags; ("rsp", - ctypes.c_ulonglong), # __extension__ unsigned long long int rsp; + ctypes.c_ulonglong), # __extension__ unsigned long long int rsp; ("ss", - ctypes.c_ulonglong), # __extension__ unsigned long long int ss; + ctypes.c_ulonglong), # __extension__ unsigned long long int ss; ("fs_base", ctypes.c_ulonglong - ), # __extension__ unsigned long long int fs_base; + ), # __extension__ unsigned long long int fs_base; ("gs_base", ctypes.c_ulonglong - ), # __extension__ unsigned long long int gs_base; + ), # __extension__ unsigned long 
long int gs_base; ("ds", - ctypes.c_ulonglong), # __extension__ unsigned long long int ds; + ctypes.c_ulonglong), # __extension__ unsigned long long int ds; ("es", - ctypes.c_ulonglong), # __extension__ unsigned long long int es; + ctypes.c_ulonglong), # __extension__ unsigned long long int es; ("fs", - ctypes.c_ulonglong), # __extension__ unsigned long long int fs; + ctypes.c_ulonglong), # __extension__ unsigned long long int fs; ("gs", ctypes.c_ulonglong - ) # __extension__ unsigned long long int gs; - ] # }; + ) # __extension__ unsigned long long int gs; + ] -#elf_greg_t = ctypes.c_ulonglong -#ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) -#elf_gregset_t = elf_greg_t*ELF_NGREG +# elf_greg_t = ctypes.c_ulonglong +# ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) +# elf_gregset_t = elf_greg_t*ELF_NGREG elf_gregset_t = user_regs_struct class elf_prstatus(ctypes.Structure): # struct elf_prstatus - _fields_ = [ # { + _fields_ = [ ( + # Info associated with signal + # struct elf_siginfo pr_info; "pr_info", elf_siginfo - ), # struct elf_siginfo pr_info; /* Info associated with signal. */ - ("pr_cursig", ctypes.c_short - ), # short int pr_cursig; /* Current signal. */ + ), + ( + # Current signal + # short int pr_cursig; + "pr_cursig", ctypes.c_short + ), ( + # Set of pending signals + # unsigned long int pr_sigpend; "pr_sigpend", ctypes.c_ulong - ), # unsigned long int pr_sigpend; /* Set of pending signals. */ + ), ( + # Set of held signals + # unsigned long int pr_sighold; "pr_sighold", ctypes.c_ulong - ), # unsigned long int pr_sighold; /* Set of held signals. */ - ("pr_pid", ctypes.c_int), # __pid_t pr_pid; - ("pr_ppid", ctypes.c_int), # __pid_t pr_ppid; - ("pr_pgrp", ctypes.c_int), # __pid_t pr_pgrp; - ("pr_sid", ctypes.c_int), # __pid_t pr_sid; - ("pr_utime", - timeval), # struct timeval pr_utime; /* User time. */ - ("pr_stime", timeval - ), # struct timeval pr_stime; /* System time. 
*/ - ( + ), + ( + # Process ID + # __pid_t pr_pid; + "pr_pid", ctypes.c_int + ), + ( + # Parent process ID + # __pid_t pr_ppid; + "pr_ppid", ctypes.c_int + ), + ( + # Parent group ID + # __pid_t pr_pgrp; + "pr_pgrp", ctypes.c_int + ), + ( + # Parent session ID + # __pid_t pr_sid; + "pr_sid", ctypes.c_int + ), + ( + # User time + # struct timeval pr_utime; + "pr_utime", timeval + ), + ( + # System time + # struct timeval pr_stime; + "pr_stime", timeval + ), + ( + # Cumulative user time + # struct timeval pr_cutime; "pr_cutime", timeval - ), # struct timeval pr_cutime; /* Cumulative user time. */ + ), ( + # Cumulative system time + # struct timeval pr_cstime; "pr_cstime", timeval - ), # struct timeval pr_cstime; /* Cumulative system time. */ - ("pr_reg", elf_gregset_t - ), # elf_gregset_t pr_reg; /* GP registers. */ + ), ( + # GP registers + # elf_gregset_t pr_reg; + "pr_reg", elf_gregset_t + ), + ( + # True if math copro being used + # int pr_fpvalid; "pr_fpvalid", ctypes.c_int - ) # int pr_fpvalid; /* True if math copro being used. */ - ] # }; + ) + ] # elf_prpsinfo related constants. -ELF_PRARGSZ = 80 # #define ELF_PRARGSZ (80) /* Number of chars for args. */ +# Number of chars for args +# #define ELF_PRARGSZ (80) +ELF_PRARGSZ = 80 class elf_prpsinfo(ctypes.Structure): # struct elf_prpsinfo - _fields_ = [ # { + _fields_ = [ ( + # Numeric process state + # char pr_state; "pr_state", ctypes.c_byte - ), # char pr_state; /* Numeric process state. */ + ), ( + # Char for pr_state + # char pr_sname; "pr_sname", ctypes.c_char - ), # char pr_sname; /* Char for pr_state. */ - ("pr_zomb", ctypes.c_byte - ), # char pr_zomb; /* Zombie. */ - ("pr_nice", ctypes.c_byte - ), # char pr_nice; /* Nice val. */ - ("pr_flag", ctypes.c_ulong - ), # unsigned long int pr_flag; /* Flags. 
*/ - # #if __WORDSIZE == 32 - # unsigned short int pr_uid; - # unsigned short int pr_gid; - # #else - ("pr_uid", ctypes.c_uint), # unsigned int pr_uid; - ("pr_gid", ctypes.c_uint), # unsigned int pr_gid; - # #endif - ("pr_pid", ctypes.c_int), # int pr_pid, pr_ppid, pr_pgrp, pr_sid; + ), + ( + # Zombie + # char pr_zomb; + "pr_zomb", ctypes.c_byte + ), + ( + # Nice value + # char pr_nice; + "pr_nice", ctypes.c_byte + ), + ( + # Flags + # unsigned long int pr_flag; + "pr_flag", ctypes.c_ulong + ), + ( + # User ID + # unsigned int pr_uid; + "pr_uid", ctypes.c_uint + ), + ( + # Group ID + # unsigned int pr_gid; + "pr_gid", ctypes.c_uint + ), + ("pr_pid", ctypes.c_int), ("pr_ppid", ctypes.c_int), ("pr_pgrp", ctypes.c_int), ("pr_sid", ctypes.c_int), - # /* Lots missing */ + # /* Lots missing */ ( + # Filename of executable + # char pr_fname[16]; "pr_fname", ctypes.c_char * 16 - ), # char pr_fname[16]; /* Filename of executable. */ + ), ( + # Initial part of arg list + # char pr_psargs[ELF_PRARGSZ]; "pr_psargs", ctypes.c_char * ELF_PRARGSZ - ) # char pr_psargs[ELF_PRARGSZ]; /* Initial part of arg list. 
*/ - ] # }; + ) + ] class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct - _fields_ = [ # { - ("cwd", ctypes.c_ushort), # unsigned short int cwd; - ("swd", ctypes.c_ushort), # unsigned short int swd; - ("ftw", ctypes.c_ushort), # unsigned short int ftw; - ("fop", ctypes.c_ushort), # unsigned short int fop; - ("rip", - ctypes.c_ulonglong), # __extension__ unsigned long long int rip; - ("rdp", - ctypes.c_ulonglong), # __extension__ unsigned long long int rdp; - ("mxcsr", ctypes.c_uint), # unsigned int mxcsr; - ("mxcr_mask", ctypes.c_uint), # unsigned int mxcr_mask; - ( - "st_space", ctypes.c_uint * 32 - ), # unsigned int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - ( - "xmm_space", ctypes.c_uint * 64 - ), # unsigned int xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ - ("padding", - ctypes.c_uint * 24), # unsigned int padding[24]; - ] # }; + _fields_ = [ + # unsigned short int cwd; + ("cwd", ctypes.c_ushort), + # unsigned short int swd; + ("swd", ctypes.c_ushort), + # unsigned short int ftw; + ("ftw", ctypes.c_ushort), + # unsigned short int fop; + ("fop", ctypes.c_ushort), + # __extension__ unsigned long long int rip; + ("rip", ctypes.c_ulonglong), + # __extension__ unsigned long long int rdp; + ("rdp", ctypes.c_ulonglong), + # unsigned int mxcsr; + ("mxcsr", ctypes.c_uint), + # unsigned int mxcr_mask; + ("mxcr_mask", ctypes.c_uint), + # unsigned int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + ("st_space", ctypes.c_uint * 32), + # unsigned int xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + ("xmm_space", ctypes.c_uint * 64), + # unsigned int padding[24]; + ("padding", ctypes.c_uint * 24), + ] elf_fpregset_t = user_fpregs_struct @@ -368,318 +452,393 @@ class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct # siginfo_t related constants. 
_SI_MAX_SIZE = 128 -_SI_PAD_SIZE = (_SI_MAX_SIZE / ctypes.sizeof(ctypes.c_int)) - 4 +_SI_PAD_SIZE = (_SI_MAX_SIZE // ctypes.sizeof(ctypes.c_int)) - 4 -# /* kill(). */ -class _siginfo_t_U_kill(ctypes.Structure): # struct - _fields_ = [ # { - ("si_pid", ctypes.c_int - ), # __pid_t si_pid; /* Sending process ID. */ +# /* kill(). */ +class _siginfo_t_U_kill(ctypes.Structure): # struct + _fields_ = [ + ( + # Sending process ID + # __pid_t si_pid; + "si_pid", ctypes.c_int + ), ( + # Real user ID of sending process + # __uid_t si_uid; "si_uid", ctypes.c_uint - ) # __uid_t si_uid; /* Real user ID of sending process. */ - ] # } _kill; + ) + ] # } _kill; # Type for data associated with a signal. class sigval_t(ctypes.Union): # typedef union sigval - _fields_ = [ # { - ("sival_int", ctypes.c_int), # int sival_int; - ("sical_ptr", ctypes.c_void_p), # void *sival_ptr; - ] # } sigval_t; - - - # /* POSIX.1b timers. */ -class _siginfo_t_U_timer(ctypes.Structure): # struct - _fields_ = [ # { - ("si_tid", - ctypes.c_int), # int si_tid; /* Timer ID. */ - ("si_overrun", ctypes.c_int - ), # int si_overrun; /* Overrun count. */ - ("si_sigval", sigval_t - ) # sigval_t si_sigval; /* Signal value. */ - ] # } _timer; - - - # /* POSIX.1b signals. */ -class _siginfo_t_U_rt(ctypes.Structure): # struct - _fields_ = [ # { - ("si_pid", ctypes.c_int - ), # __pid_t si_pid; /* Sending process ID. */ + _fields_ = [ + ("sival_int", ctypes.c_int), # int sival_int; + ("sical_ptr", ctypes.c_void_p), # void *sival_ptr; + ] # } sigval_t; + + +# /* POSIX.1b timers. */ +class _siginfo_t_U_timer(ctypes.Structure): # struct + _fields_ = [ + ( + # Timer ID + # int si_tid; + "si_tid", ctypes.c_int + ), + ( + # Overrun count + # int si_overrun; + "si_overrun", ctypes.c_int + ), ( + # Signal value + # sigval_t si_sigval; + "si_sigval", sigval_t + ) + ] # } _timer; + + +# /* POSIX.1b signals. 
*/ +class _siginfo_t_U_rt(ctypes.Structure): # struct + _fields_ = [ + ( + # Sending process ID + # __pid_t si_pid; + "si_pid", ctypes.c_int + ), + ( + # Real user ID of sending process + # __uid_t si_uid; "si_uid", ctypes.c_uint - ), # __uid_t si_uid; /* Real user ID of sending process. */ - ("si_sigval", sigval_t - ) # sigval_t si_sigval; /* Signal value. */ - ] # } _rt; + ), + ( + # Signal value + # sigval_t si_sigval; + "si_sigval", sigval_t + ) + ] # } _rt; - # /* SIGCHLD. */ -class _siginfo_t_U_sigchld(ctypes.Structure): # struct - _fields_ = [ # { - ("si_pid", - ctypes.c_int), # __pid_t si_pid; /* Which child. */ +# /* SIGCHLD. */ +class _siginfo_t_U_sigchld(ctypes.Structure): # struct + _fields_ = [ + ( + # Which child + # __pid_t si_pid; + "si_pid", ctypes.c_int + ), ( + # Real user ID of sending process + # __uid_t si_uid; "si_uid", ctypes.c_uint - ), # __uid_t si_uid; /* Real user ID of sending process. */ - ("si_status", ctypes.c_int - ), # int si_status; /* Exit value or signal. */ - ("si_utime", ctypes.c_long), # __sigchld_clock_t si_utime; - ("si_stime", ctypes.c_long) # __sigchld_clock_t si_stime; - ] # } _sigchld; + ), + ( + # Exit value or signal + # int si_status; + "si_status", ctypes.c_int + ), + ( + # __sigchld_clock_t si_utime; + "si_utime", ctypes.c_long + ), + ( + # __sigchld_clock_t si_stime; + "si_stime", ctypes.c_long + ) + ] # } _sigchld; - # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ -class _siginfo_t_U_sigfault(ctypes.Structure): # struct - _fields_ = [ # { - ("si_addr", ctypes.c_void_p - ), # void *si_addr; /* Faulting insn/memory ref. */ +# /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ +class _siginfo_t_U_sigfault(ctypes.Structure): # struct + _fields_ = [ ( + # Faulting insn/memory ref + # void *si_addr; + "si_addr", ctypes.c_void_p + ), + ( + # Valid LSB of the reported address + # short int si_addr_lsb; "si_addr_lsb", ctypes.c_short - ) # short int si_addr_lsb; /* Valid LSB of the reported address. 
*/ - ] # } _sigfault; + ) + ] # } _sigfault; - # /* SIGPOLL. */ -class _siginfo_t_U_sigpoll(ctypes.Structure): # struct - _fields_ = [ # { - ("si_band", ctypes.c_long - ), # long int si_band; /* Band event for SIGPOLL. */ - ("si_fd", ctypes.c_int) # int si_fd; - ] # } _sigpoll; +# /* SIGPOLL. */ +class _siginfo_t_U_sigpoll(ctypes.Structure): # struct + _fields_ = [ + ( + # Band event for SIGPOLL + # long int si_band; + "si_band", ctypes.c_long + ), + ( + # int si_fd; + "si_fd", ctypes.c_int + ) + ] # } _sigpoll; - # /* SIGSYS. */ -class _siginfo_t_U_sigsys(ctypes.Structure): # struct - _fields_ = [ # { +# /* SIGSYS. */ +class _siginfo_t_U_sigsys(ctypes.Structure): # struct + _fields_ = [ ("_call_addr", ctypes.c_void_p - ), # void *_call_addr; /* Calling user insn. */ + ), # void *_call_addr; /* Calling user insn. */ ( "_syscall", ctypes.c_int - ), # int _syscall; /* Triggering system call number. */ + ), # int _syscall; /* Triggering system call number. */ ("_arch", ctypes.c_uint - ) # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - ] # } _sigsys; + ) # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ + ] # } _sigsys; -class _siginfo_t_U(ctypes.Union): # union - _fields_ = [ # { +class _siginfo_t_U(ctypes.Union): # union + _fields_ = [ ("_pad", - ctypes.c_int * _SI_PAD_SIZE), # int _pad[__SI_PAD_SIZE]; - # - # /* kill(). */ - ("_kill", _siginfo_t_U_kill), # struct - # { - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # } _kill; - # - # /* POSIX.1b timers. */ - ("_timer", _siginfo_t_U_timer), # struct - # { - # int si_tid; /* Timer ID. */ - # int si_overrun; /* Overrun count. */ - # sigval_t si_sigval; /* Signal value. */ - # } _timer; - # - # /* POSIX.1b signals. */ - ("_rt", _siginfo_t_U_rt), # struct - # { - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # sigval_t si_sigval; /* Signal value. */ - # } _rt; - # - # /* SIGCHLD. 
*/ - ("_sigchld", _siginfo_t_U_sigchld), # struct - # { - # __pid_t si_pid; /* Which child. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # int si_status; /* Exit value or signal. */ - # __sigchld_clock_t si_utime; - # __sigchld_clock_t si_stime; - # } _sigchld; - # - # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ - ("_sigfault", _siginfo_t_U_sigfault), # struct - # { - # void *si_addr; /* Faulting insn/memory ref. */ - # short int si_addr_lsb; /* Valid LSB of the reported address. */ - # } _sigfault; - # - # /* SIGPOLL. */ - ("_sigpoll", _siginfo_t_U_sigpoll), # struct - # { - # long int si_band; /* Band event for SIGPOLL. */ - # int si_fd; - # } _sigpoll; - # - # /* SIGSYS. */ - ("_sigsys", _siginfo_t_U_sigpoll) # struct - # { - # void *_call_addr; /* Calling user insn. */ - # int _syscall; /* Triggering system call number. */ - # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - # } _sigsys; - ] # } _sifields; + ctypes.c_int * _SI_PAD_SIZE), # int _pad[__SI_PAD_SIZE]; + + # /* kill(). */ + ("_kill", _siginfo_t_U_kill), # struct + + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # } _kill; + + # /* POSIX.1b timers. */ + ("_timer", _siginfo_t_U_timer), # struct + + # int si_tid; /* Timer ID. */ + # int si_overrun; /* Overrun count. */ + # sigval_t si_sigval; /* Signal value. */ + # } _timer; + + # /* POSIX.1b signals. */ + ("_rt", _siginfo_t_U_rt), # struct + + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # sigval_t si_sigval; /* Signal value. */ + # } _rt; + + # /* SIGCHLD. */ + ("_sigchld", _siginfo_t_U_sigchld), # struct + + # __pid_t si_pid; /* Which child. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # int si_status; /* Exit value or signal. */ + # __sigchld_clock_t si_utime; + # __sigchld_clock_t si_stime; + # } _sigchld; + + # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. 
*/ + ("_sigfault", _siginfo_t_U_sigfault), # struct + + # void *si_addr; /* Faulting insn/memory ref. */ + # short int si_addr_lsb; /* Valid LSB of the reported address. */ + # } _sigfault; + + # /* SIGPOLL. */ + ("_sigpoll", _siginfo_t_U_sigpoll), # struct + + # long int si_band; /* Band event for SIGPOLL. */ + # int si_fd; + # } _sigpoll; + + # /* SIGSYS. */ + ("_sigsys", _siginfo_t_U_sigpoll) # struct + + # void *_call_addr; /* Calling user insn. */ + # int _syscall; /* Triggering system call number. */ + # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ + # } _sigsys; + ] # } _sifields; class siginfo_t(ctypes.Structure): # typedef struct - _fields_ = [ # { - ("si_signo", ctypes.c_int - ), # int si_signo; /* Signal number. */ + _fields_ = [ + ( + # Signal number + # int si_signo; + "si_signo", ctypes.c_int + ), ( + # If non-zero, an errno value associated with + # int si_errno; "si_errno", ctypes.c_int - ), # int si_errno; /* If non-zero, an errno value associated with - # this signal, as defined in . */ - ("si_code", ctypes.c_int - ), # int si_code; /* Signal code. */ - # - ("_sifields", _siginfo_t_U) # union - # { - # int _pad[__SI_PAD_SIZE]; + ), + ( + # Signal code - this signal, as defined in + # int si_code; + "si_code", ctypes.c_int + ), + ( + # Union + "_sifields", _siginfo_t_U + ) + + # int _pad[__SI_PAD_SIZE]; # - # /* kill(). */ - # struct - # { - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # } _kill; + # /* kill(). */ + # struct + + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # } _kill; # - # /* POSIX.1b timers. */ - # struct - # { - # int si_tid; /* Timer ID. */ - # int si_overrun; /* Overrun count. */ - # sigval_t si_sigval; /* Signal value. */ - # } _timer; + # /* POSIX.1b timers. */ + # struct + + # int si_tid; /* Timer ID. */ + # int si_overrun; /* Overrun count. */ + # sigval_t si_sigval; /* Signal value. 
*/ + # } _timer; # - # /* POSIX.1b signals. */ - # struct - # { - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # sigval_t si_sigval; /* Signal value. */ - # } _rt; + # /* POSIX.1b signals. */ + # struct + + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # sigval_t si_sigval; /* Signal value. */ + # } _rt; # - # /* SIGCHLD. */ - # struct - # { - # __pid_t si_pid; /* Which child. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # int si_status; /* Exit value or signal. */ - # __sigchld_clock_t si_utime; - # __sigchld_clock_t si_stime; - # } _sigchld; + # /* SIGCHLD. */ + # struct + + # __pid_t si_pid; /* Which child. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # int si_status; /* Exit value or signal. */ + # __sigchld_clock_t si_utime; + # __sigchld_clock_t si_stime; + # } _sigchld; # - # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ - # struct - # { - # void *si_addr; /* Faulting insn/memory ref. */ - # short int si_addr_lsb; /* Valid LSB of the reported address. */ - # } _sigfault; + # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ + # struct + + # void *si_addr; /* Faulting insn/memory ref. */ + # short int si_addr_lsb; /* Valid LSB of the reported address. */ + # } _sigfault; # - # /* SIGPOLL. */ - # struct - # { - # long int si_band; /* Band event for SIGPOLL. */ - # int si_fd; - # } _sigpoll; + # /* SIGPOLL. */ + # struct + + # long int si_band; /* Band event for SIGPOLL. */ + # int si_fd; + # } _sigpoll; # - # /* SIGSYS. */ - # struct - # { - # void *_call_addr; /* Calling user insn. */ - # int _syscall; /* Triggering system call number. */ - # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - # } _sigsys; - # } _sifields; - ] # } siginfo_t __SI_ALIGNMENT; + # /* SIGSYS. */ + # struct + + # void *_call_addr; /* Calling user insn. */ + # int _syscall; /* Triggering system call number. 
*/ + # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ + # } _sigsys; + # } _sifields; + ] # } siginfo_t __SI_ALIGNMENT; # xsave related. class ymmh_struct(ctypes.Structure): # struct ymmh_struct { - _fields_ = [("ymmh_space", 64 * ctypes.c_uint - ) # u32 ymmh_space[64]; - ] # } __packed; + _fields_ = [ + # u32 ymmh_space[64]; + ("ymmh_space", 64 * ctypes.c_uint) + ] # } __packed; class xsave_hdr_struct(ctypes.Structure): # struct xsave_hdr_struct { _fields_ = [ - ("xstate_bv", ctypes.c_ulonglong - ), # u64 xstate_bv; - ("reserved1", ctypes.c_ulonglong * - 2), # u64 reserved1[2]; - ("reserved2", ctypes.c_ulonglong * 5 - ) # u64 reserved2[5]; + # u64 xstate_bv; + ("xstate_bv", ctypes.c_ulonglong), + # u64 reserved1[2]; + ("reserved1", ctypes.c_ulonglong * 2), + # u64 reserved2[5]; + ("reserved2", ctypes.c_ulonglong * 5) ] # } __packed; class i387_fxsave_struct(ctypes.Structure): # struct i387_fxsave_struct { _fields_ = [ ( + # Control Word + # u16 cwd; "cwd", ctypes.c_ushort - ), # u16 cwd; /* Control Word */ + ), ( + # Status Word + # u16 swd; "swd", ctypes.c_ushort - ), # u16 swd; /* Status Word */ + ), ( + # Tag Word + # u16 twd; "twd", ctypes.c_ushort - ), # u16 twd; /* Tag Word */ + ), ( + # Last Instruction Opcode + # u16 fop; "fop", ctypes.c_ushort - ), # u16 fop; /* Last Instruction Opcode */ - # union { - # struct { + ), + # union { + # struct { ( + # Instruction Pointer + # u64 rip; "rip", ctypes.c_ulonglong - ), # u64 rip; /* Instruction Pointer */ + ), ( + # Data Pointer + # u64 rdp; "rdp", ctypes.c_ulonglong - ), # u64 rdp; /* Data Pointer */ - # }; - # struct { - # u32 fip; /* FPU IP Offset */ - # u32 fcs; /* FPU IP Selector */ - # u32 foo; /* FPU Operand Offset */ - # u32 fos; /* FPU Operand Selector */ - # }; - # }; + ), + + # struct { + # u32 fip; /* FPU IP Offset */ + # u32 fcs; /* FPU IP Selector */ + # u32 foo; /* FPU Operand Offset */ + # u32 fos; /* FPU Operand Selector */ + ( + # MXCSR Register State + # u32 mxcsr; "mxcsr", ctypes.c_uint - 
), # u32 mxcsr; /* MXCSR Register State */ + ), ( + # MXCSR Mask + # u32 mxcsr_mask; "mxcsr_mask", ctypes.c_uint - ), # u32 mxcsr_mask; /* MXCSR Mask */ - # - # /* 8*16 bytes for each FP-reg = 128 bytes */ - ("st_space", ctypes.c_uint * 32 - ), # u32 st_space[32]; - # - # /* 16*16 bytes for each XMM-reg = 256 bytes */ - ("xmm_space", ctypes.c_uint * 64 - ), # u32 xmm_space[64]; - # - ("padding", ctypes.c_uint * 12 - ), # u32 padding[12]; - # - # union { - ("padding1", ctypes.c_uint * 12 - ) # u32 padding1[12]; - # u32 sw_reserved[12]; - # }; - # + ), + # 8*16 bytes for each FP-reg = 128 bytes + ( + # u32 st_space[32]; + "st_space", ctypes.c_uint * 32 + ), + # 16*16 bytes for each XMM-reg = 256 bytes + ( + # u32 xmm_space[64]; + "xmm_space", ctypes.c_uint * 64 + ), + ( + # u32 padding[12]; + "padding", ctypes.c_uint * 12 + ), + # union { + ( + # u32 padding1[12]; + "padding1", ctypes.c_uint * 12 + ) + # u32 sw_reserved[12]; ] # } __aligned(16); class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { _fields_ = [ - ("i387", - i387_fxsave_struct), # struct i387_fxsave_struct i387; - ("xsave_hdr", xsave_hdr_struct - ), # struct xsave_hdr_struct xsave_hdr; - ("ymmh", ymmh_struct) # struct ymmh_struct ymmh; + # struct i387_fxsave_struct i387; + ("i387", i387_fxsave_struct), + # struct xsave_hdr_struct xsave_hdr; + ("xsave_hdr", xsave_hdr_struct), + # struct ymmh_struct ymmh; + ("ymmh", ymmh_struct) ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 50a2fa9c55..03b64585c1 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -31,6 +31,7 @@ obj-y += fsnotify.o obj-y += image-desc.o obj-y += image.o obj-y += img-streamer.o +obj-y += io_uring.o obj-y += ipc_ns.o obj-y += irmap.o obj-y += kcmp-ids.o diff --git a/criu/apparmor.c b/criu/apparmor.c index 328fc606bb..f9ad796195 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -630,7 +630,7 @@ int suspend_aa(void) } ret = do_suspend(true); - if 
(rm_rf(policydir) < 0) + if (rmrf(policydir) < 0) pr_err("failed removing policy dir %s\n", policydir); return ret; diff --git a/criu/arch/ppc64/restorer.c b/criu/arch/ppc64/restorer.c index c17ba16699..56c09391e7 100644 --- a/criu/arch/ppc64/restorer.c +++ b/criu/arch/ppc64/restorer.c @@ -45,10 +45,10 @@ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) unsigned long raddr; int ret; - ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ - shmflg, /* second */ + ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ + shmflg, /* second */ (unsigned long)&raddr, /* third */ - shmaddr, /* ptr */ + shmaddr, /* ptr */ 0 /* fifth not used */); if (ret) diff --git a/criu/arch/ppc64/vdso-pie.c b/criu/arch/ppc64/vdso-pie.c index f01123efee..a84ae776bb 100644 --- a/criu/arch/ppc64/vdso-pie.c +++ b/criu/arch/ppc64/vdso-pie.c @@ -110,9 +110,9 @@ static inline void put_trampoline_call(unsigned long at, unsigned long to, unsig { uint32_t *addr = (uint32_t *)at; - *addr++ = 0x7C0802a6; /* mflr r0 */ + *addr++ = 0x7C0802a6; /* mflr r0 */ *addr++ = 0x48000001 | ((long)(tr - at - 4) & 0x3fffffc); /* bl tr */ - *(uint64_t *)addr = to; /* the address to read by the trampoline */ + *(uint64_t *)addr = to; /* the address to read by the trampoline */ invalidate_caches(at); } diff --git a/criu/arch/s390/restorer.c b/criu/arch/s390/restorer.c index 6907ad75bf..8b3bc44baf 100644 --- a/criu/arch/s390/restorer.c +++ b/criu/arch/s390/restorer.c @@ -23,10 +23,10 @@ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) unsigned long raddr; int ret; - ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ - shmflg, /* second */ + ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ + shmflg, /* second */ (unsigned long)&raddr, /* third */ - shmaddr, /* ptr */ + shmaddr, /* ptr */ 0 /* fifth not used */); if (ret) diff --git a/criu/arch/s390/vdso-pie.c b/criu/arch/s390/vdso-pie.c index ad504beda0..bf0366b0e4 100644 --- a/criu/arch/s390/vdso-pie.c +++ b/criu/arch/s390/vdso-pie.c @@ -18,9 
+18,9 @@ */ typedef struct { u8 larl[6]; /* Load relative address of imm64 */ - u8 lg[6]; /* Load %r1 with imm64 */ - u8 br[2]; /* Branch to %r1 */ - u64 addr; /* Jump address */ + u8 lg[6]; /* Load %r1 with imm64 */ + u8 br[2]; /* Branch to %r1 */ + u64 addr; /* Jump address */ u32 guards; /* Guard bytes */ } __packed jmp_t; diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index d02f4abd5b..b3a7ca6365 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -107,64 +107,103 @@ int cpu_dump_cpuinfo(void) #define __ins_bit(__l, __v) (1u << ((__v)-32u * (__l))) +// clang-format off static uint32_t x86_ins_capability_mask[NCAPINTS] = { - [CPUID_1_EDX] = __ins_bit(CPUID_1_EDX, X86_FEATURE_FPU) | __ins_bit(CPUID_1_EDX, X86_FEATURE_TSC) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_CX8) | __ins_bit(CPUID_1_EDX, X86_FEATURE_SEP) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_CMOV) | __ins_bit(CPUID_1_EDX, X86_FEATURE_CLFLUSH) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_MMX) | __ins_bit(CPUID_1_EDX, X86_FEATURE_FXSR) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM) | __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM2), - - [CPUID_8000_0001_EDX] = __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_SYSCALL) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_MMXEXT) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_RDTSCP) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOWEXT) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOW), - - [CPUID_LNX_1] = __ins_bit(CPUID_LNX_1, X86_FEATURE_REP_GOOD) | __ins_bit(CPUID_LNX_1, X86_FEATURE_NOPL), - - [CPUID_1_ECX] = __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM3) | __ins_bit(CPUID_1_ECX, X86_FEATURE_PCLMULQDQ) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_MWAIT) | __ins_bit(CPUID_1_ECX, X86_FEATURE_SSSE3) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_CX16) | __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_1) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_2) | __ins_bit(CPUID_1_ECX, X86_FEATURE_MOVBE) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_POPCNT) | __ins_bit(CPUID_1_ECX, X86_FEATURE_AES) | - 
__ins_bit(CPUID_1_ECX, X86_FEATURE_XSAVE) | __ins_bit(CPUID_1_ECX, X86_FEATURE_OSXSAVE) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_AVX) | __ins_bit(CPUID_1_ECX, X86_FEATURE_F16C) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_RDRAND), + [CPUID_1_EDX] = + __ins_bit(CPUID_1_EDX, X86_FEATURE_FPU) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_TSC) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CX8) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_SEP) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CMOV) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CLFLUSH) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_MMX) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_FXSR) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM2), + + [CPUID_8000_0001_EDX] = + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_SYSCALL) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_MMXEXT) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_RDTSCP) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOWEXT) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOW), + + [CPUID_LNX_1] = + __ins_bit(CPUID_LNX_1, X86_FEATURE_REP_GOOD) | + __ins_bit(CPUID_LNX_1, X86_FEATURE_NOPL), + + [CPUID_1_ECX] = + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM3) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_PCLMULQDQ) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_MWAIT) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_SSSE3) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_CX16) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_1) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_2) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_MOVBE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_POPCNT) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_AES) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XSAVE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_OSXSAVE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_AVX) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_F16C) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_RDRAND), [CPUID_8000_0001_ECX] = - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_ABM) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_SSE4A) | - __ins_bit(CPUID_8000_0001_ECX, 
X86_FEATURE_MISALIGNSSE) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_3DNOWPREFETCH) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_XOP) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_FMA4) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_TBM), + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_ABM) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_SSE4A) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_MISALIGNSSE) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_3DNOWPREFETCH) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_XOP) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_FMA4) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_TBM), [CPUID_7_0_EBX] = - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_FSGSBASE) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI1) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_HLE) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX2) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI2) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ERMS) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RTM) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_MPX) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512F) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512DQ) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RDSEED) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ADX) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_CLFLUSHOPT) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512PF) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512ER) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512CD) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_SHA_NI) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512BW) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512VL), - - [CPUID_D_1_EAX] = __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEOPT) | - __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEC) | __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XGETBV1), + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_FSGSBASE) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI1) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_HLE) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX2) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI2) | + __ins_bit(CPUID_7_0_EBX, 
X86_FEATURE_ERMS) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RTM) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_MPX) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512F) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512DQ) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RDSEED) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ADX) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_CLFLUSHOPT) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512PF) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512ER) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512CD) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_SHA_NI) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512BW) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512VL), + + [CPUID_D_1_EAX] = + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEOPT) | + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEC) | + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XGETBV1), [CPUID_7_0_ECX] = - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512VBMI) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VBMI2) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_GFNI) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VAES) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VPCLMULQDQ) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VNNI) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_BITALG) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_TME) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VPOPCNTDQ) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_RDPID), - - [CPUID_8000_0008_EBX] = __ins_bit(CPUID_8000_0008_EBX, X86_FEATURE_CLZERO), - - [CPUID_7_0_EDX] = __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4VNNIW) | - __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4FMAPS), + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512VBMI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VBMI2) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_GFNI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VAES) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VPCLMULQDQ) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VNNI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_BITALG) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_TME) | + 
__ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VPOPCNTDQ) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_RDPID), + + [CPUID_8000_0008_EBX] = + __ins_bit(CPUID_8000_0008_EBX, X86_FEATURE_CLZERO), + + [CPUID_7_0_EDX] = + __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4VNNIW) | + __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4FMAPS), }; +// clang-format on #undef __ins_bit diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index 23438314f8..f7a6d50589 100644 --- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -13,7 +13,7 @@ extern void restore_tls(tls_t *ptls); extern int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act); extern int set_compat_robust_list(uint32_t head_ptr, uint32_t len); -#else /* CONFIG_COMPAT */ +#else /* CONFIG_COMPAT */ static inline void restore_tls(tls_t *ptls) { } diff --git a/criu/arch/x86/sigaction_compat.c b/criu/arch/x86/sigaction_compat.c index f02b2cc0e4..506a8d1bb1 100644 --- a/criu/arch/x86/sigaction_compat.c +++ b/criu/arch/x86/sigaction_compat.c @@ -44,8 +44,8 @@ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) memcpy(stack32, act, sizeof(rt_sigaction_t_compat)); arg.nr = __NR32_rt_sigaction; arg.arg0 = sig; - arg.arg1 = (uint32_t)act_stack; /* act */ - arg.arg2 = 0; /* oldact */ + arg.arg1 = (uint32_t)act_stack; /* act */ + arg.arg2 = 0; /* oldact */ arg.arg3 = (uint32_t)sizeof(act->rt_sa_mask); /* sigsetsize */ return do_full_int80(&arg); diff --git a/criu/bpf-util.c b/criu/bpf-util.c new file mode 100644 index 0000000000..3652027fbf --- /dev/null +++ b/criu/bpf-util.c @@ -0,0 +1,259 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "xmalloc.h" +#include "bpf-util.h" +#include "bpf_insn.h" +#include "common/bug.h" + +/* XXX: Propagate the case of errors from bpf_map_update_elem */ + +static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int 
size) +{ + return syscall(__NR_bpf, cmd, attr, size); +} + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +static int bpf_map_create(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); +} + +static int bpf_prog_load_iter(struct bpf_insn *insns, int insn_cnt) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_TRACING; + attr.expected_attach_type = BPF_TRACE_ITER; + + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insn_cnt; + + return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); +} + +struct bpf_insn_buf { + int insn_cap; + int insn_cnt; + struct bpf_insn insns[]; +}; + +static struct bpf_insn_buf *bpf_insn_buf_alloc(void) +{ + struct bpf_insn_buf *ibuf; + + ibuf = xmalloc(offsetof(struct bpf_insn_buf, insns[64])); + if (!ibuf) + return NULL; + ibuf->insn_cap = 64; + ibuf->insn_cnt = 0; + return ibuf; +} + +static void bpf_insn_buf_free(struct bpf_insn_buf *ibuf) +{ + xfree(ibuf); +} + +static int bpf_insn_buf_push(struct bpf_insn_buf *ibuf, struct bpf_insn *insns, int insn_cnt) +{ + BUG_ON(!ibuf); + if (ibuf->insn_cap >= ibuf->insn_cnt + insn_cnt) + goto push; + ibuf = xrealloc(ibuf, offsetof(struct bpf_insn_buf, insns[ibuf->insn_cap + insn_cnt])); + if (!ibuf) + return -ENOMEM; +push: + memcpy(ibuf->insns + ibuf->insn_cnt, insns, insn_cnt * sizeof(*insns)); + ibuf->insn_cnt += insn_cnt; + ibuf->insn_cap += insn_cnt; + return 0; +} + +#define bpf_push(insn) \ + ({ \ + if ((ret = bpf_insn_buf_push(ibuf, (struct bpf_insn[]){ insn }, \ + sizeof((struct bpf_insn[]){ insn }) / sizeof(struct bpf_insn)))) \ + goto exit; \ + }) + +typedef int bpf_insn_buf_fill_cb(struct bpf_fdtable *meta, struct bpf_insn_buf *ibuf, void 
*userdata); + +enum fill_type { + FILL_TASK_FILE, + FILL_IO_URING, + FILL_EPOLL, +}; + +static int bpf_fill_fdtable(enum fill_type type, int *fill_desc, struct bpf_fdtable *meta, + int index_size, int max_entries, bpf_insn_buf_fill_cb fill_insn, + void *userdata) +{ + int file2index_map_fd, index2file_map_fd, ret; + struct bpf_insn_buf *ibuf; + + BUG_ON(!meta); + BUG_ON(index_size != 4 || index_size != 8); + + file2index_map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, sizeof(uint64_t), index_size, max_entries); + if (file2index_map_fd < 0) + return -errno; + + index2file_map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, index_size, sizeof(uint64_t), max_entries); + if (index2file_map_fd < 0) { + ret = -errno; + goto end_file2fd; + } + + meta->file2index_map_fd = file2index_map_fd; + meta->index2file_map_fd = index2file_map_fd; + + ibuf = bpf_insn_buf_alloc(); + if (!ibuf) { + ret = -ENOMEM; + goto end_fd2file; + } + + if ((ret = fill_insn(meta, ibuf, userdata))) + goto end_ibuf; + + ret = bpf_prog_load_iter(ibuf->insns, ibuf->insn_cnt); + if (ret < 0) + ret = -errno; + bpf_insn_buf_free(ibuf); + + return ret; +end_ibuf: + bpf_insn_buf_free(ibuf); +end_fd2file: + close(index2file_map_fd); +end_file2fd: + close(file2index_map_fd); + return ret; +} + +static int task_fill_cb(struct bpf_fdtable *meta, struct bpf_insn_buf *ibuf, void *userdata) +{ + int tgid = *(int *)userdata, ret; + + (void)tgid; + /* XXX: Fixup task_struct::tgid offset and compare (requires libbpf dep) */ + bpf_push(BPF_MOV64_REG(BPF_REG_6, BPF_REG_1)); + /* index -> file */ + bpf_push(BPF_LD_MAP_FD(BPF_REG_1, meta->index2file_map_fd)); + bpf_push(BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 /* meta + */)); + bpf_push(BPF_MOV64_REG(BPF_REG_3, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8 + 8 + 8 /* meta + task + fd */)); + bpf_push(BPF_MOV64_IMM(BPF_REG_4, 0)); + bpf_push(BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem)); + /* file -> 
index */ + bpf_push(BPF_MOV64_REG(BPF_REG_1, meta->file2index_map_fd)); + bpf_push(BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 /* meta + ctx */)); + bpf_push(BPF_MOV64_REG(BPF_REG_3, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 + 8 /* meta + ctx + file */)); + bpf_push(BPF_MOV64_IMM(BPF_REG_4, 0)); + bpf_push(BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem)); + bpf_push(BPF_MOV64_IMM(BPF_REG_0, 0)); + bpf_push(BPF_EXIT_INSN()); + return 0; +exit: + return ret; +} + +int bpf_fill_task_fdtable(int tgid, struct bpf_fdtable *meta) +{ + return bpf_fill_fdtable(FILL_TASK_FILE, &tgid, meta, sizeof(int), 65535, task_fill_cb, + &tgid); +} + +static int io_uring_fill_cb(struct bpf_fdtable *meta, struct bpf_insn_buf *ibuf, void *userdata) +{ + int ret; + + /* XXX: Consider skipping in sparse set */ + bpf_push(BPF_MOV64_REG(BPF_REG_6, BPF_REG_1)); + /* index -> file */ + bpf_push(BPF_LD_MAP_FD(BPF_REG_1, meta->index2file_map_fd)); + bpf_push(BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 + 8 /* meta + ctx + file */)); + bpf_push(BPF_MOV64_REG(BPF_REG_3, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8 + 8 /* meta + ctx */)); + bpf_push(BPF_MOV64_IMM(BPF_REG_4, 0)); + bpf_push(BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem)); + /* file -> index */ + bpf_push(BPF_MOV64_REG(BPF_REG_1, meta->file2index_map_fd)); + bpf_push(BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 /* meta + ctx */)); + bpf_push(BPF_MOV64_REG(BPF_REG_3, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 + 8 /* meta + ctx + file */)); + bpf_push(BPF_MOV64_IMM(BPF_REG_4, 0)); + bpf_push(BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem)); + bpf_push(BPF_MOV64_IMM(BPF_REG_0, 0)); + bpf_push(BPF_EXIT_INSN()); + return 0; +exit: + return ret; +} + +int bpf_fill_io_uring_fdtable(int 
io_uring_fd, struct bpf_fdtable *meta) +{ + return bpf_fill_fdtable(FILL_IO_URING, &io_uring_fd, meta, sizeof(unsigned long), + 4096, io_uring_fill_cb, NULL); +} + +int epoll_fill_cb(struct bpf_fdtable *meta, struct bpf_insn_buf *ibuf, void *userdata) +{ + int ret; + + /* XXX: Relocate epitem offsets */ + bpf_push(BPF_MOV64_REG(BPF_REG_6, BPF_REG_1)); + /* index -> file */ + bpf_push(BPF_LD_MAP_FD(BPF_REG_1, meta->index2file_map_fd)); + bpf_push(BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 + 8 /* meta + ctx + file */)); + bpf_push(BPF_MOV64_REG(BPF_REG_3, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8 + 8 /* meta + ctx */)); + bpf_push(BPF_MOV64_IMM(BPF_REG_4, 0)); + bpf_push(BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem)); + /* file -> index */ + bpf_push(BPF_MOV64_REG(BPF_REG_1, meta->file2index_map_fd)); + bpf_push(BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 /* meta + ctx */)); + bpf_push(BPF_MOV64_REG(BPF_REG_3, BPF_REG_6)); + bpf_push(BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8 + 8 + 8 /* meta + ctx + file */)); + bpf_push(BPF_MOV64_IMM(BPF_REG_4, 0)); + bpf_push(BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem)); + bpf_push(BPF_MOV64_IMM(BPF_REG_0, 0)); + bpf_push(BPF_EXIT_INSN()); + return 0; +exit: + return ret; +} + +int bpf_fill_epoll_fdtable(int epoll_fd, struct bpf_fdtable *meta) +{ + return bpf_fill_fdtable(FILL_EPOLL, &epoll_fd, meta, sizeof(unsigned long), + 4096, io_uring_fill_cb, NULL); +} diff --git a/criu/cgroup.c b/criu/cgroup.c index ccac37fcc5..82d9b16a2e 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -81,7 +81,7 @@ static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what) if (l2->next != ctls) c2 = list_first_entry(l2, struct cg_ctl, l); - if (!c1 || !c2) /* Nowhere to move next */ + if (!c1 || !c2) /* Nowhere to move next */ return !c1 && !c2; /* Both lists scanned -- match */ if 
(strcmp(c1->name, c2->name)) @@ -860,7 +860,7 @@ static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ** cde->dir_perms->gid = cur->gid; cde->dir_name = cur->path + poff; - if (poff != 1) /* parent isn't "/" */ + if (poff != 1) /* parent isn't "/" */ cde->dir_name++; /* leading / */ cde->n_children = cur->n_children; if (cur->n_children > 0) diff --git a/criu/config.c b/criu/config.c index 91fb0b64d4..33f2820a18 100644 --- a/criu/config.c +++ b/criu/config.c @@ -229,7 +229,7 @@ int parse_statement(int i, char *line, char **configuration) tmp_string[0] = 0; /* Check for unsupported configuration file entries */ - if (configuration[i] + offset + 1 != 0 && strchr(configuration[i] + offset, ' ')) { + if (strchr(configuration[i] + offset, ' ')) { int j; len = strlen(configuration[i] + offset); for (j = 0; j < len - 1; j++) { diff --git a/criu/cr-check.c b/criu/cr-check.c index 3575fb3b36..0320b445aa 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -575,6 +575,7 @@ static pid_t fork_and_ptrace_attach(int (*child_setup)(void)) if (read(sk, &c, 1) != 1) { close(sk); kill(pid, SIGKILL); + waitpid(pid, NULL, 0); pr_perror("read"); return -1; } @@ -584,6 +585,7 @@ static pid_t fork_and_ptrace_attach(int (*child_setup)(void)) if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) { pr_perror("Unable to ptrace the child"); kill(pid, SIGKILL); + waitpid(pid, NULL, 0); return -1; } @@ -618,6 +620,7 @@ static int check_ptrace_peeksiginfo(void) } kill(pid, SIGKILL); + waitpid(pid, NULL, 0); return ret; } @@ -768,6 +771,7 @@ static int check_special_mapping_mremap(void) /* Probably, we're interrupted with a signal - cleanup */ pr_err("Failed to wait for a child %d\n", errno); kill(child, SIGKILL); + waitpid(child, NULL, 0); return -1; } @@ -806,6 +810,7 @@ static int check_ptrace_suspend_seccomp(void) } kill(pid, SIGKILL); + waitpid(pid, NULL, 0); return ret; } @@ -846,6 +851,7 @@ static int check_ptrace_dump_seccomp_filters(void) } kill(pid, SIGKILL); + 
waitpid(pid, NULL, 0); return ret; } @@ -1372,6 +1378,14 @@ static int check_network_lock_nftables(void) return 0; } +static int check_sockopt_buf_lock(void) +{ + if (!kdat.has_sockopt_buf_lock) + return -1; + + return 0; +} + static int (*chk_feature)(void); /* @@ -1490,6 +1504,7 @@ int cr_check(void) ret |= check_ns_pid(); ret |= check_apparmor_stacking(); ret |= check_network_lock_nftables(); + ret |= check_sockopt_buf_lock(); } /* @@ -1602,6 +1617,7 @@ static struct feature_list feature_list[] = { { "ns_pid", check_ns_pid }, { "apparmor_stacking", check_apparmor_stacking }, { "network_lock_nftables", check_network_lock_nftables }, + { "sockopt_buf_lock", check_sockopt_buf_lock }, { NULL, NULL }, }; @@ -1621,7 +1637,7 @@ void pr_check_features(const char *offset, const char *sep, int width) } pr_msg("%s", fl->name); // no \n pos += len; - if ((fl + 1)->name) { // not the last item + if ((fl + 1)->name) { // not the last item pr_msg("%s", sep); // no \n pos += sep_len; } diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 940f622462..be4947afa6 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -85,6 +85,9 @@ #include "pidfd-store.h" #include "apparmor.h" #include "asm/dump.h" +#include "io_uring.h" + +#include "compel/plugins/std/syscall-codes.h" /* * Architectures can overwrite this function to restore register sets that @@ -191,10 +194,11 @@ struct cr_imgset *glob_imgset; static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) { + char buf[PATH_MAX] = {}; struct dirent *de; - DIR *fd_dir; + int n, pidfd = -1; int size = 0; - int n; + DIR *fd_dir; pr_info("\n"); pr_info("Collecting fds (pid: %d)\n", pid); @@ -204,6 +208,59 @@ static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) if (!fd_dir) return -1; + /* Before collecting fds, we need to bring io_uring to steady state, + * since it can install fds into task's fdtable, and if we do it later, + * during actual io_uring dump, we will miss dumping these files. 
+ */ + while ((de = readdir(fd_dir))) { + if (dir_dots(de)) + continue; + + n = dirfd(fd_dir); + if (n == -1) { + close(pidfd); + return -1; + } + + n = readlinkat(n, de->d_name, buf, sizeof(buf)); + if (n == -1) { + close(pidfd); + return -1; + } + + if (is_io_uring_link(buf)) { + if (!kdat.has_pidfd_open) { + pr_err("pidfd_open system call not supported\n"); + return -ENOTSUP; + } + + if (!kdat.has_pidfd_getfd) { + pr_err("pidfd_getfd system call not supported\n"); + return -ENOTSUP; + } + + if (pidfd == -1) { + pidfd = syscall(SYS_pidfd_open, pid, 0); + if (pidfd < 0) { + pr_err("Failed to open pidfd for pid %d\n", pid); + return pidfd; + } + } + + if (io_uring_synchronize_fd(syscall(SYS_pidfd_getfd, pidfd, atoi(de->d_name), 0))) { + pr_err("Failed to synchronize io_uring fd %d for pid %d\n", atoi(de->d_name), pid); + close(pidfd); + return -1; + } + } + } + + if (pidfd >= 0) + close(pidfd); + + /* Collect fds now */ + rewinddir(fd_dir); + n = 0; while ((de = readdir(fd_dir))) { if (dir_dots(de)) @@ -489,6 +546,8 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, const struc ret = check_sysvipc_map_dump(pid, vma); else if (vma_entry_is(vma, VMA_AREA_SOCKET)) ret = dump_socket_map(vma_area); + else if (vma_entry_is(vma, VMA_AREA_IO_URING)) + ret = dump_io_uring_map(vma_area); else ret = 0; if (ret) @@ -615,7 +674,7 @@ static int dump_task_kobj_ids(struct pstree_item *item) TaskKobjIdsEntry *ids = item->ids; elem.pid = pid; - elem.idx = 0; /* really 0 for all */ + elem.idx = 0; /* really 0 for all */ elem.genid = 0; /* FIXME optimize */ new = 0; @@ -1129,6 +1188,13 @@ static int dump_zombies(void) item->pgid = pps_buf.pgid; BUG_ON(!list_empty(&item->children)); + + if (!item->sid) { + pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", + item->pid->real, vpid(item)); + goto err; + } + if (dump_one_zombie(item, &pps_buf) < 0) goto err; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 
9d2d957f85..01e6749ed5 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -29,6 +29,7 @@ #include "servicefd.h" #include "image.h" #include "img-streamer.h" +#include "io_uring.h" #include "util.h" #include "util-pie.h" #include "criu-log.h" @@ -277,7 +278,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &io_uring_cinfo, }; /* These images are required to restore namespaces */ @@ -2388,6 +2389,29 @@ static int restore_root_task(struct pstree_item *init) pr_err("Unable to flush breakpoints\n"); finalize_restore(); + /* + * Some external devices such as GPUs might need a very late + * trigger to kick-off some events, memory notifiers and for + * restarting the previously restored queues during criu restore + * stage. This is needed since criu pie code may shuffle VMAs + * around so things such as registering MMU notifiers (for GPU + * mapped memory) could be done sanely once the pie code hands + * over the control to master process. + */ + for_each_pstree_item(item) { + pr_info("Run late stage hook from criu master for external devices\n"); + ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real); + /* + * This may not really be an error. Only certain plugin hooks + * (if available) will return success such as amdgpu_plugin that + * validates the pid of the resuming tasks in the kernel mode. + * Most of the times, it'll be -ENOTSUP and in few cases, it + * might actually be a true error code but that would be also + * captured in the plugin so no need to print the error here. 
+ */ + if (ret < 0) + pr_debug("restore late stage hook for external plugin failed\n"); + } ret = run_scripts(ACT_PRE_RESUME); if (ret) diff --git a/criu/cr-service.c b/criu/cr-service.c index 0f8bc4cc10..59f46b3201 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -169,11 +169,11 @@ int send_criu_dump_resp(int socket_fd, bool success, bool restored) return send_criu_msg(socket_fd, &msg); } -static int send_criu_pre_dump_resp(int socket_fd, bool success) +static int send_criu_pre_dump_resp(int socket_fd, bool success, bool single) { CriuResp msg = CRIU_RESP__INIT; - msg.type = CRIU_REQ_TYPE__PRE_DUMP; + msg.type = single ? CRIU_REQ_TYPE__SINGLE_PRE_DUMP : CRIU_REQ_TYPE__PRE_DUMP; msg.success = success; set_resp_err(&msg); @@ -735,6 +735,7 @@ static int dump_using_req(int sk, CriuOpts *req) bool success = false; bool self_dump = !req->pid; + opts.mode = CR_DUMP; if (setup_opts_from_req(sk, req)) goto exit; @@ -777,6 +778,7 @@ static int restore_using_req(int sk, CriuOpts *req) opts.restore_detach = true; + opts.mode = CR_RESTORE; if (setup_opts_from_req(sk, req)) goto exit; @@ -828,6 +830,7 @@ static int check(int sk, CriuOpts *req) if (pid == 0) { setproctitle("check --rpc"); + opts.mode = CR_CHECK; if (setup_opts_from_req(sk, req)) exit(1); @@ -845,7 +848,7 @@ static int check(int sk, CriuOpts *req) return send_criu_msg(sk, &resp); } -static int pre_dump_using_req(int sk, CriuOpts *req) +static int pre_dump_using_req(int sk, CriuOpts *req, bool single) { int pid, status; bool success = false; @@ -859,6 +862,7 @@ static int pre_dump_using_req(int sk, CriuOpts *req) if (pid == 0) { int ret = 1; + opts.mode = CR_PRE_DUMP; if (setup_opts_from_req(sk, req)) goto cout; @@ -886,7 +890,7 @@ static int pre_dump_using_req(int sk, CriuOpts *req) success = true; out: - if (send_criu_pre_dump_resp(sk, success) == -1) { + if (send_criu_pre_dump_resp(sk, success, single) == -1) { pr_perror("Can't send pre-dump resp"); success = false; } @@ -899,7 +903,7 @@ static int 
pre_dump_loop(int sk, CriuReq *msg) int ret; do { - ret = pre_dump_using_req(sk, msg->opts); + ret = pre_dump_using_req(sk, msg->opts, false); if (ret < 0) return ret; @@ -936,6 +940,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) if (pid == 0) { close(start_pipe[0]); + opts.mode = CR_PAGE_SERVER; if (setup_opts_from_req(sk, req)) goto out_ch; @@ -1182,6 +1187,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; + opts.mode = CR_CPUINFO; if (setup_opts_from_req(sk, msg->opts)) goto cout; @@ -1231,6 +1237,8 @@ int cr_service_work(int sk) CriuReq *msg = 0; more: + opts.mode = CR_SWRK; + if (recv_criu_msg(sk, &msg) != 0) { pr_perror("Can't recv request"); goto err; @@ -1271,6 +1279,9 @@ int cr_service_work(int sk) case CRIU_REQ_TYPE__VERSION: ret = handle_version(sk, msg); break; + case CRIU_REQ_TYPE__SINGLE_PRE_DUMP: + ret = pre_dump_using_req(sk, msg->opts, true); + break; default: send_criu_err(sk, "Invalid req"); diff --git a/criu/crtools.c b/criu/crtools.c index 6a75cd1ea2..0752800f6f 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -56,17 +56,58 @@ void flush_early_log_to_stderr(void) static int image_dir_mode(char *argv[], int optind) { - if (!strcmp(argv[optind], "dump") || !strcmp(argv[optind], "pre-dump") || - (!strcmp(argv[optind], "cpuinfo") && !strcmp(argv[optind + 1], "dump"))) + switch (opts.mode) { + case CR_DUMP: + /* fallthrough */ + case CR_PRE_DUMP: return O_DUMP; - - if (!strcmp(argv[optind], "restore") || - (!strcmp(argv[optind], "cpuinfo") && !strcmp(argv[optind + 1], "restore"))) + case CR_RESTORE: return O_RSTR; + case CR_CPUINFO: + if (!strcmp(argv[optind + 1], "dump")) + return O_DUMP; + /* fallthrough */ + default: + return -1; + } + /* never reached */ + BUG(); return -1; } +static int parse_criu_mode(char *mode) +{ + if (!strcmp(mode, "dump")) + opts.mode = CR_DUMP; + else if (!strcmp(mode, "pre-dump")) + opts.mode = CR_PRE_DUMP; + else if (!strcmp(mode, "restore")) + 
opts.mode = CR_RESTORE; + else if (!strcmp(mode, "lazy-pages")) + opts.mode = CR_LAZY_PAGES; + else if (!strcmp(mode, "check")) + opts.mode = CR_CHECK; + else if (!strcmp(mode, "page-server")) + opts.mode = CR_PAGE_SERVER; + else if (!strcmp(mode, "service")) + opts.mode = CR_SERVICE; + else if (!strcmp(mode, "swrk")) + opts.mode = CR_SWRK; + else if (!strcmp(mode, "dedup")) + opts.mode = CR_DEDUP; + else if (!strcmp(mode, "cpuinfo")) + opts.mode = CR_CPUINFO; + else if (!strcmp(mode, "exec")) + opts.mode = CR_EXEC_DEPRECATED; + else if (!strcmp(mode, "show")) + opts.mode = CR_SHOW_DEPRECATED; + else + return -1; + + return 0; +} + int main(int argc, char *argv[], char *envp[]) { int ret = -1; @@ -106,7 +147,30 @@ int main(int argc, char *argv[], char *envp[]) log_set_loglevel(opts.log_level); - if (optind < argc && !strcmp(argv[optind], "swrk")) { + /* + * There kernel might send us lethal signals in the following cases: + * 1) Writing a pipe which reader has disappeared. + * 2) Writing to a socket of type SOCK_STREAM which is no longer connected. + * We deal with write()/Send() failures on our own, and prefer not to get killed. + * So we ignore SIGPIPEs. + * + * Pipes are used in various places: + * 1) Receiving application page data + * 2) Transmitting data to the image streamer + * 3) Emitting logs (potentially to a pipe). + * Sockets are mainly used in transmitting memory data. 
+ */ + if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { + pr_perror("Failed to set a SIGPIPE signal ignore."); + return 1; + } + + if (parse_criu_mode(argv[optind])) { + pr_err("unknown command: %s\n", argv[optind]); + goto usage; + } + + if (opts.mode == CR_SWRK) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); return 1; @@ -138,7 +202,7 @@ int main(int argc, char *argv[], char *envp[]) goto usage; } - if (strcmp(argv[optind], "restore")) { + if (opts.mode != CR_RESTORE) { pr_err("--exec-cmd is available for the restore command only\n"); goto usage; } @@ -155,9 +219,12 @@ int main(int argc, char *argv[], char *envp[]) opts.exec_cmd[argc - optind - 1] = NULL; } else { /* No subcommands except for cpuinfo and restore --exec-cmd */ - if (strcmp(argv[optind], "cpuinfo") && has_sub_command) { + if (opts.mode != CR_CPUINFO && has_sub_command) { pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); goto usage; + } else if (opts.mode == CR_CPUINFO && !has_sub_command) { + pr_err("cpuinfo requires an action: dump or check\n"); + goto usage; } } @@ -167,7 +234,7 @@ int main(int argc, char *argv[], char *envp[]) } /* We must not open imgs dir, if service is called */ - if (strcmp(argv[optind], "service")) { + if (opts.mode != CR_SERVICE) { ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); @@ -175,27 +242,11 @@ int main(int argc, char *argv[], char *envp[]) } } - /* - * The kernel might send us lethal signals when writing to a pipe - * which reader has disappeared. We deal with write() failures on our - * own, and prefer not to get killed. So we ignore SIGPIPEs. - * - * Pipes are used in various places: - * 1) Receiving application page data - * 2) Transmitting data to the image streamer - * 3) Emitting logs (potentially to a pipe). 
- */ - if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { - pr_perror("Failed to set a SIGPIPE signal ignore."); - return 1; - } - /* * When a process group becomes an orphan, * its processes are sent a SIGHUP signal */ - if (!strcmp(argv[optind], "restore") && opts.restore_detach && opts.final_state == TASK_STOPPED && - opts.shell_job) + if (opts.mode == CR_RESTORE && opts.restore_detach && opts.final_state == TASK_STOPPED && opts.shell_job) pr_warn("Stopped and detached shell job will get SIGHUP from OS.\n"); if (chdir(opts.work_dir)) { @@ -215,7 +266,7 @@ int main(int argc, char *argv[], char *envp[]) kdat.can_map_vdso = 0; if (!list_empty(&opts.inherit_fds)) { - if (strcmp(argv[optind], "restore")) { + if (opts.mode != CR_RESTORE) { pr_err("--inherit-fd is restore-only option\n"); return 1; } @@ -226,13 +277,14 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - if (!strcmp(argv[optind], "dump")) { + if (opts.mode == CR_DUMP) { if (!opts.tree_id) goto opt_pid_missing; + return cr_dump_tasks(opts.tree_id); } - if (!strcmp(argv[optind], "pre-dump")) { + if (opts.mode == CR_PRE_DUMP) { if (!opts.tree_id) goto opt_pid_missing; @@ -244,7 +296,7 @@ int main(int argc, char *argv[], char *envp[]) return cr_pre_dump_tasks(opts.tree_id) != 0; } - if (!strcmp(argv[optind], "restore")) { + if (opts.mode == CR_RESTORE) { if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -259,22 +311,22 @@ int main(int argc, char *argv[], char *envp[]) return ret != 0; } - if (!strcmp(argv[optind], "lazy-pages")) + if (opts.mode == CR_LAZY_PAGES) return cr_lazy_pages(opts.daemon_mode) != 0; - if (!strcmp(argv[optind], "check")) + if (opts.mode == CR_CHECK) return cr_check() != 0; - if (!strcmp(argv[optind], "page-server")) + if (opts.mode == CR_PAGE_SERVER) return cr_page_server(opts.daemon_mode, false, -1) != 0; - if (!strcmp(argv[optind], "service")) + if (opts.mode == CR_SERVICE) return 
cr_service(opts.daemon_mode); - if (!strcmp(argv[optind], "dedup")) + if (opts.mode == CR_DEDUP) return cr_dedup() != 0; - if (!strcmp(argv[optind], "cpuinfo")) { + if (opts.mode == CR_CPUINFO) { if (!argv[optind + 1]) { pr_err("cpuinfo requires an action: dump or check\n"); goto usage; @@ -285,12 +337,12 @@ int main(int argc, char *argv[], char *envp[]) return cpuinfo_check(); } - if (!strcmp(argv[optind], "exec")) { + if (opts.mode == CR_EXEC_DEPRECATED) { pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; } - if (!strcmp(argv[optind], "show")) { + if (opts.mode == CR_SHOW_DEPRECATED) { pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; @@ -388,6 +440,8 @@ int main(int argc, char *argv[], char *envp[]) " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" " -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" " -L|--libdir path to a plugin directory (by default " CR_PLUGIN_DEFAULT ")\n" + " --timeout NUM a timeout (in seconds) on collecting tasks during dump\n" + " (default 10 seconds)\n" " --force-irmap force resolving names for inotify/fsnotify watches\n" " --irmap-scan-path FILE\n" " add a path the irmap hints to scan\n" diff --git a/criu/fdstore.c b/criu/fdstore.c index 77935484fd..65264a5116 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -13,10 +13,12 @@ #include "rst-malloc.h" #include "log.h" +/* clang-format off */ static struct fdstore_desc { int next_id; mutex_t lock; /* to protect a peek offset */ -} * desc; +} *desc; +/* clang-format on */ int fdstore_init(void) { diff --git a/criu/file-ids.c b/criu/file-ids.c index 1b9d688882..772bd92cf0 100644 --- a/criu/file-ids.c +++ b/criu/file-ids.c @@ -77,8 +77,14 @@ int fd_id_generate_special(struct fd_parms *p, u32 *id) fi = fd_id_cache_lookup(p); if (fi) { - *id = fi->id; - return 0; + if (p->stat.st_mode & (S_IFCHR | 
S_IFBLK)) { + /* Don't cache the id for mapped devices */ + *id = fd_tree.subid++; + return 1; + } else { + *id = fi->id; + return 0; + } } } diff --git a/criu/files-reg.c b/criu/files-reg.c index ee54d1d7d3..6759e00e59 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -959,7 +959,25 @@ void free_link_remaps(void) } static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags); -static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_id *nsid, const struct stat *st) +static void check_overlayfs_fallback(char *path, const struct fd_parms *parms, bool *fallback) +{ + if (!fallback || parms->fs_type != OVERLAYFS_SUPER_MAGIC) + return; + + /* + * In overlayFS, linkat() fails with ENOENT if the removed file is + * originated from lower layer. The cause of failure is that linkat() + * sees the file has st_nlink=0, which is different than st_nlink=1 we + * got from earlier fstat() on lfd. By setting *fb=true, we will fall + * back to dump_ghost_remap() as it is what should have been done to + * removed files with st_nlink=0. + */ + pr_info("Unable to link-remap %s on overlayFS, fall back to dump_ghost_remap\n", path); + *fallback = true; +} + +static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_id *nsid, const struct fd_parms *parms, + bool *fallback) { char link_name[PATH_MAX], *tmp; FileEntry fe = FILE_ENTRY__INIT; @@ -967,6 +985,7 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i FownEntry fwn = FOWN_ENTRY__INIT; int mntns_root; int ret; + const struct stat *ost = &parms->stat; if (!opts.link_remap_ok) { pr_err("Can't create link remap for %s. 
" @@ -1005,11 +1024,12 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i mntns_root = mntns_get_root_fd(nsid); again: - ret = linkat_hard(lfd, "", mntns_root, link_name, st->st_uid, st->st_gid, AT_EMPTY_PATH); + ret = linkat_hard(lfd, "", mntns_root, link_name, ost->st_uid, ost->st_gid, AT_EMPTY_PATH); if (ret < 0 && errno == ENOENT) { /* Use grand parent, if parent directory does not exist. */ if (trim_last_parent(link_name) < 0) { pr_err("trim failed: @%s@\n", link_name); + check_overlayfs_fallback(path, parms, fallback); return -1; } goto again; @@ -1028,12 +1048,13 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } -static int dump_linked_remap(char *path, int len, const struct stat *ost, int lfd, u32 id, struct ns_id *nsid) +static int dump_linked_remap(char *path, int len, const struct fd_parms *parms, int lfd, u32 id, struct ns_id *nsid, + bool *fallback) { u32 lid; RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; - if (create_link_remap(path, len, lfd, &lid, nsid, ost)) + if (create_link_remap(path, len, lfd, &lid, nsid, parms, fallback)) return -1; rpe.orig_id = id; @@ -1150,6 +1171,7 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, struct stat pst; const struct stat *ost = &parms->stat; int flags = 0; + bool fallback = false; if (parms->fs_type == PROC_SUPER_MAGIC) { /* The file points to /proc/pid/ where pid is a dead @@ -1239,7 +1261,7 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * links on it) to have some persistent name at hands. 
*/ pr_debug("Dump silly-rename linked remap for %x\n", id); - return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid); + return dump_linked_remap(rpath + 1, plen - 1, parms, lfd, id, nsid, NULL); } mntns_root = mntns_get_root_fd(nsid); @@ -1260,7 +1282,15 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, if (errno == ENOENT) { link_strip_deleted(link); - return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid); + ret = dump_linked_remap(rpath + 1, plen - 1, parms, lfd, id, nsid, &fallback); + if (ret < 0 && fallback) { + /* fallback is true only if following conditions are true: + * 1. linkat() inside dump_linked_remap() failed with ENOENT + * 2. parms->fs_type == overlayFS + */ + return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid); + } + return ret; } pr_perror("Can't stat path"); @@ -1792,30 +1822,42 @@ static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, return ret; } -static void rm_parent_dirs(int mntns_root, char *path, int count) +static int rm_parent_dirs(int mntns_root, char *path, int count) { char *p, *prev = NULL; + int ret = -1; - if (!count) - return; - - while (count > 0) { - count -= 1; + while (count-- > 0) { p = strrchr(path, '/'); - if (p) + if (p) { + /* We don't handle "//" in path */ + BUG_ON(prev && (prev - p == 1)); *p = '\0'; + } else { + /* Inconsistent path and count */ + pr_perror("Can't strrchr \"/\" in \"%s\"/\"%s\"]" + " left count=%d\n", + path, prev ? 
prev + 1 : "", count + 1); + goto err; + } + if (prev) *prev = '/'; + prev = p; - if (unlinkat(mntns_root, path, AT_REMOVEDIR)) + if (unlinkat(mntns_root, path, AT_REMOVEDIR)) { pr_perror("Can't remove %s AT %d", path, mntns_root); - else - pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root); - prev = p; + goto err; + } + pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root); } + ret = 0; +err: if (prev) *prev = '/'; + + return ret; } /* Construct parent dir name and mkdir parent/grandparents if they're not exist */ @@ -1847,6 +1889,7 @@ static int make_parent_dirs_if_need(int mntns_root, char *path) err = mkdirat(mntns_root, path, 0777); if (err && errno != EEXIST) { pr_perror("Can't create dir: %s AT %d", path, mntns_root); + /* Failing anyway -> no retcode check */ rm_parent_dirs(mntns_root, path, count); count = -1; goto out; @@ -1867,6 +1910,9 @@ static int make_parent_dirs_if_need(int mntns_root, char *path) * This routine properly resolves d's path handling ghost/link-remaps. * The open_cb is a routine that does actual open, it differs for * files, directories, fifos, etc. + * + * Return 0 on success, -1 on error and 1 to indicate soft error, which can be + * retried. 
*/ static int rfi_remap(struct reg_file_info *rfi, int *level) @@ -1930,8 +1976,11 @@ static int rfi_remap(struct reg_file_info *rfi, int *level) if (linkat_hard(mntns_root, rpath, mntns_root, path, rfi->remap->uid, rfi->remap->gid, 0) < 0) { int errno_saved = errno; - rm_parent_dirs(mntns_root, path, *level); - errno = errno_saved; + + if (!rm_parent_dirs(mntns_root, path, *level) && errno_saved == EEXIST) { + errno = errno_saved; + return 1; + } return -1; } @@ -2008,11 +2057,12 @@ static bool validate_file(const int fd, const struct stat *fd_status, const stru int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg) { - int tmp, mntns_root, level = 0; + int tmp = -1, mntns_root, level = 0; struct reg_file_info *rfi; char *orig_path = NULL; char path[PATH_MAX]; int inh_fd = -1; + int ret; if (inherited_fd(d, &tmp)) return tmp; @@ -2049,14 +2099,9 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil */ orig_path = rfi->path; rfi->path = rfi->remap->rpath; - } else if (rfi_remap(rfi, &level) < 0) { + } else if ((ret = rfi_remap(rfi, &level)) == 1) { static char tmp_path[PATH_MAX]; - if (errno != EEXIST) { - pr_perror("Can't link %s -> %s", rfi->remap->rpath, rfi->path); - return -1; - } - /* * The file whose name we're trying to create * exists. 
Need to pick some other one, we're @@ -2070,12 +2115,15 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil orig_path = rfi->path; rfi->path = tmp_path; snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path); - pr_debug("Fake %s -> %s link\n", rfi->path, rfi->remap->rpath); + pr_debug("Fake %s -> %s link\n", rfi->remap->rpath, rfi->path); - if (rfi_remap(rfi, &level) < 0) { + if (rfi_remap(rfi, &level)) { pr_perror("Can't create even fake link!"); - return -1; + goto err; } + } else if (ret < 0) { + pr_perror("Can't link %s -> %s", rfi->remap->rpath, rfi->path); + goto err; } } @@ -2085,7 +2133,7 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil if (tmp < 0) { pr_perror("Can't open file %s", rfi->path); close_safe(&inh_fd); - return -1; + goto err; } close_safe(&inh_fd); @@ -2094,15 +2142,15 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil if (fstat(tmp, &st) < 0) { pr_perror("Can't fstat opened file"); - return -1; + goto err; } if (!validate_file(tmp, &st, rfi)) - return -1; + goto err; if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { pr_err("File %s has bad mode 0%o (expect 0%o)\n", rfi->path, (int)st.st_mode, rfi->rfe->mode); - return -1; + goto err; } /* @@ -2115,8 +2163,18 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil if (rfi->remap) { if (!rfi->remap->is_dir) { - unlinkat(mntns_root, rfi->path, 0); - rm_parent_dirs(mntns_root, rfi->path, level); + struct mount_info *mi = lookup_mnt_id(rfi->rfe->mnt_id); + + if (mi && try_remount_writable(mi, true)) + goto err; + + pr_debug("Unlink: %d:%s\n", rfi->rfe->mnt_id, rfi->path); + if (unlinkat(mntns_root, rfi->path, 0)) { + pr_perror("Failed to unlink the remap file"); + goto err; + } + if (rm_parent_dirs(mntns_root, rfi->path, level)) + goto err; } mutex_unlock(remap_open_lock); @@ -2124,10 +2182,17 @@ int open_path(struct file_desc *d, int (*open_cb)(int 
mntns_root, struct reg_fil if (orig_path) rfi->path = orig_path; - if (restore_fown(tmp, rfi->rfe->fown)) + if (restore_fown(tmp, rfi->rfe->fown)) { + close(tmp); return -1; + } return tmp; +err: + if (rfi->remap) + mutex_unlock(remap_open_lock); + close_safe(&tmp); + return -1; } int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg) @@ -2236,8 +2301,8 @@ static struct filemap_ctx ctx; void filemap_ctx_init(bool auto_close) { ctx.desc = NULL; /* to fail the first comparison in open_ */ - ctx.fd = -1; /* not to close random fd in _fini */ - ctx.vma = NULL; /* not to put spurious VMA_CLOSE in _fini */ + ctx.fd = -1; /* not to close random fd in _fini */ + ctx.vma = NULL; /* not to put spurious VMA_CLOSE in _fini */ /* flags may remain any */ ctx.close = auto_close; } @@ -2267,6 +2332,23 @@ static int open_filemap(int pid, struct vma_area *vma) BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); flags = vma->e->fdflags; + /* update the new device file page offsets and file paths set during restore */ + if (vma->e->status & VMA_UNSUPP) { + uint64_t new_pgoff; + char new_path[PATH_MAX]; + int ret; + + struct reg_file_info *rfi = container_of(vma->vmfd, struct reg_file_info, d); + ret = run_plugins(UPDATE_VMA_MAP, rfi->rfe->name, new_path, vma->e->start, vma->e->pgoff, &new_pgoff); + if (ret == 1) { + pr_info("New mmap %#016" PRIx64 "->%#016" PRIx64 " path %s\n", vma->e->pgoff, new_pgoff, + new_path); + vma->e->pgoff = new_pgoff; + rfi->path = xstrdup(new_path); + pr_debug("Updated rfi->path %s\n", rfi->path); + } + } + if (ctx.flags != flags || ctx.desc != vma->vmfd) { if (vma->e->status & VMA_AREA_MEMFD) ret = memfd_open(vma->vmfd, &flags); diff --git a/criu/files.c b/criu/files.c index 93754fb440..256ad5821f 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "io_uring.h" #include "protobuf.h" #include "util.h" @@ -506,7 +507,7 @@ static int 
dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, } p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ - p.dfds = dfds; /* epoll needs to verify if target fd exist */ + p.dfds = dfds; /* epoll needs to verify if target fd exist */ if (S_ISSOCK(p.stat.st_mode)) return dump_socket(&p, lfd, e); @@ -536,6 +537,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; #endif + else if (is_io_uring_link(link)) + ops = &io_uring_dump_ops; else return dump_unsupp_fd(&p, lfd, "anon", link, e); @@ -1486,7 +1489,7 @@ int shared_fdt_prepare(struct pstree_item *item) struct inherit_fd { struct list_head inh_list; char *inh_id; /* file identifier */ - int inh_fd; /* criu's descriptor to inherit */ + int inh_fd; /* criu's descriptor to inherit */ int inh_fd_id; }; diff --git a/criu/fsnotify.c b/criu/fsnotify.c index b5dd15dd89..22fb749731 100644 --- a/criu/fsnotify.c +++ b/criu/fsnotify.c @@ -132,7 +132,7 @@ static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_ if (!mnt_is_dir(m)) continue; - mntfd = __open_mountpoint(m, -1); + mntfd = __open_mountpoint(m); pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n", m->mnt_id, m->root, m->ns_mountpoint, mntfd); if (mntfd < 0) @@ -206,7 +206,7 @@ static int open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handl if (m->s_dev != s_dev || !mnt_is_dir(m)) continue; - mntfd = __open_mountpoint(m, -1); + mntfd = __open_mountpoint(m); if (mntfd < 0) { pr_warn("Can't open mount for s_dev %x, continue\n", s_dev); continue; diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c0986..b72df0d98e 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,8 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + 
FD_ENTRY_F(IO_URING_FILE, "io_uring-file", O_NOBUF), + FD_ENTRY_F(IO_URING_DATA, "io_uring-data", O_NOBUF), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/include/aio.h b/criu/include/aio.h index f8a59dfdff..d1655739d9 100644 --- a/criu/include/aio.h +++ b/criu/include/aio.h @@ -13,8 +13,8 @@ struct task_restore_args; int prepare_aios(struct pstree_item *t, struct task_restore_args *ta); struct aio_ring { - unsigned id; /* kernel internal index number */ - unsigned nr; /* number of io_events */ + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). */ unsigned tail; diff --git a/criu/include/autofs.h b/criu/include/autofs.h index c4e0f23ed7..b158025c7f 100644 --- a/criu/include/autofs.h +++ b/criu/include/autofs.h @@ -96,7 +96,7 @@ struct args_ismountpoint { struct autofs_dev_ioctl { __u32 ver_major; __u32 ver_minor; - __u32 size; /* total size of data passed in + __u32 size; /* total size of data passed in * including this struct */ __s32 ioctlfd; /* automount command fd */ diff --git a/criu/include/bfd.h b/criu/include/bfd.h index 4268f74d4a..2846ec6286 100644 --- a/criu/include/bfd.h +++ b/criu/include/bfd.h @@ -5,8 +5,8 @@ struct bfd_buf; struct xbuf { - char *mem; /* buffer */ - char *data; /* position we see bytes at */ + char *mem; /* buffer */ + char *data; /* position we see bytes at */ unsigned int sz; /* bytes sitting after b->pos */ struct bfd_buf *buf; }; diff --git a/criu/include/bpf-util.h b/criu/include/bpf-util.h new file mode 100644 index 0000000000..b5b9713198 --- /dev/null +++ b/criu/include/bpf-util.h @@ -0,0 +1,16 @@ +#ifndef __CR_BPF_UTIL_H__ +#define __CR_BPF_UTIL_H__ + +#include +#include + +struct bpf_fdtable { + int file2index_map_fd; + int index2file_map_fd; +}; + +int bpf_fill_task_fdtable(pid_t tgid, struct bpf_fdtable *meta); +int bpf_fill_io_uring_fdtable(int io_uring_fd, struct bpf_fdtable 
*meta); +int bpf_fill_epoll_fdtable(int epoll_fd, struct bpf_fdtable *meta); + +#endif diff --git a/criu/include/bpf_insn.h b/criu/include/bpf_insn.h new file mode 100644 index 0000000000..29c3bb6ad1 --- /dev/null +++ b/criu/include/bpf_insn.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* eBPF instruction mini library */ +#ifndef __BPF_INSN_H +#define __BPF_INSN_H + +struct bpf_insn; + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +#define BPF_ALU64_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_ALU32_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Short form of mov, dst_reg = src_reg */ + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* Short form of mov, dst_reg = imm32 */ + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV32_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg 
= 0, \ + .off = 0, \ + .imm = IMM }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + +#ifndef BPF_PSEUDO_MAP_FD +# define BPF_PSEUDO_MAP_FD 1 +#endif + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + + +/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ + +#define BPF_LD_ABS(SIZE, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = 
atomic_fetch_xor(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) + */ + +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. 
*/ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Raw code statement block */ + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +/* Program exit */ + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#endif diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index a34f8dbbf4..85648bf1c8 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -100,6 +100,22 @@ struct irmap_path_opt { struct irmap *ir; }; +enum criu_mode { + CR_UNSET = 0, + CR_DUMP, + CR_PRE_DUMP, + CR_RESTORE, + CR_LAZY_PAGES, + CR_CHECK, + CR_PAGE_SERVER, + CR_SERVICE, + CR_SWRK, + CR_DEDUP, + CR_CPUINFO, + CR_EXEC_DEPRECATED, + CR_SHOW_DEPRECATED, +}; + struct cr_options { int final_state; int check_extra_features; @@ -188,6 +204,9 @@ struct cr_options { /* This stores which method to use for file validation. */ int file_validation_method; + + /* Shows the mode criu is running at the moment: dump/pre-dump/restore/... 
*/ + enum criu_mode mode; }; extern struct cr_options opts; diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 897666ecdd..0bc7a4255a 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -22,6 +22,8 @@ #include #include +#include +#include #define CRIU_PLUGIN_GEN_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c)) #define CRIU_PLUGIN_VERSION_MAJOR 0 @@ -48,6 +50,12 @@ enum { CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6, + CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA = 7, + + CR_PLUGIN_HOOK__UPDATE_VMA_MAP = 8, + + CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, + CR_PLUGIN_HOOK__MAX }; @@ -60,6 +68,10 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct stat *stat); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *old_path, char *new_path, const uint64_t addr, + const uint64_t old_pgoff, uint64_t *new_pgoff); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); enum { CR_PLUGIN_STAGE__DUMP, @@ -130,5 +142,9 @@ typedef int(cr_plugin_restore_file_t)(int id); typedef int(cr_plugin_dump_ext_mount_t)(char *mountpoint, int id); typedef int(cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file); typedef int(cr_plugin_dump_ext_link_t)(int index, int type, char *kind); +typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); +typedef int(cr_plugin_update_vma_map_t)(const char *old_path, char *new_path, const uint64_t addr, + const uint64_t old_pgoff, uint64_t *new_pgoff); +typedef int(cr_plugin_resume_devices_late_t)(int pid); #endif /* __CRIU_PLUGIN_H__ */ diff --git 
a/criu/include/file-lock.h b/criu/include/file-lock.h index 0ce2fa3409..9ab79b66b1 100644 --- a/criu/include/file-lock.h +++ b/criu/include/file-lock.h @@ -30,12 +30,12 @@ #define LOCK_SH 1 /* shared lock */ #define LOCK_EX 2 /* exclusive lock */ #define LOCK_NB \ - 4 /* or'd with one of the above to prevent + 4 /* or'd with one of the above to prevent blocking */ #define LOCK_UN 8 /* remove lock */ -#define LOCK_MAND 32 /* This is a mandatory flock ... */ -#define LOCK_READ 64 /* which allows concurrent read operations */ +#define LOCK_MAND 32 /* This is a mandatory flock ... */ +#define LOCK_READ 64 /* which allows concurrent read operations */ #define LOCK_WRITE 128 /* which allows concurrent write operations */ #define LOCK_RW 192 /* which allows concurrent read & write ops */ @@ -47,7 +47,7 @@ struct file_lock { int fl_kind; int fl_ltype; - pid_t fl_owner; /* process, which created the lock */ + pid_t fl_owner; /* process, which created the lock */ pid_t fl_holder; /* pid of fd on whose the lock is found */ int maj, min; unsigned long i_no; diff --git a/criu/include/files.h b/criu/include/files.h index 96face71ba..aadc09f736 100644 --- a/criu/include/files.h +++ b/criu/include/files.h @@ -82,8 +82,8 @@ enum { struct fdinfo_list_entry { struct list_head desc_list; /* To chain on @fd_info_head */ - struct file_desc *desc; /* Associated file descriptor */ - struct list_head ps_list; /* To chain per-task files */ + struct file_desc *desc; /* Associated file descriptor */ + struct list_head ps_list; /* To chain per-task files */ struct pstree_item *task; FdinfoEntry *fe; int pid; @@ -121,12 +121,12 @@ unsigned int find_unused_fd(struct pstree_item *, int hint_fd); struct fdinfo_list_entry *find_used_fd(struct pstree_item *, int fd); struct file_desc { - u32 id; /* File id, unique */ - struct hlist_node hash; /* Descriptor hashing and lookup */ - struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ - struct 
file_desc_ops *ops; /* Associated operations */ + u32 id; /* File id, unique */ + struct hlist_node hash; /* Descriptor hashing and lookup */ + struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ + struct file_desc_ops *ops; /* Associated operations */ struct list_head fake_master_list; /* To chain in the list of file_desc, which don't - have a fle in a task, that having permissions */ + * have a fle in a task, that having permissions */ }; struct fdtype_ops { diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index 46ac8aa27d..ad34f48915 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -53,4 +53,8 @@ #define AUTOFS_SUPER_MAGIC 0x0187 #endif +#ifndef OVERLAYFS_SUPER_MAGIC +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 5045baee80..5dd4ae822c 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -71,6 +71,8 @@ enum { CR_FD_MEMFD_INODE, CR_FD_BPFMAP_FILE, CR_FD_BPFMAP_DATA, + CR_FD_IO_URING_FILE, + CR_FD_IO_URING_DATA, _CR_FD_GLOB_TO, CR_FD_TMPFS_IMG, @@ -122,8 +124,8 @@ enum { /* file descriptors template */ struct cr_fd_desc_tmpl { const char *fmt; /* format for the name */ - u32 magic; /* magic in the header */ - int oflags; /* flags for image_open */ + u32 magic; /* magic in the header */ + int oflags; /* flags for image_open */ }; extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX]; diff --git a/criu/include/image.h b/criu/include/image.h index 14659dbd24..13e0dbcc89 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -79,11 +79,12 @@ #define VMA_ANON_SHARED (1 << 8) #define VMA_ANON_PRIVATE (1 << 9) -#define VMA_AREA_SYSVIPC (1 << 10) -#define VMA_AREA_SOCKET (1 << 11) -#define VMA_AREA_VVAR (1 << 12) -#define VMA_AREA_AIORING (1 << 13) -#define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_SYSVIPC (1 << 10) +#define VMA_AREA_SOCKET (1 
<< 11) +#define VMA_AREA_VVAR (1 << 12) +#define VMA_AREA_AIORING (1 << 13) +#define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_IO_URING (1 << 15) #define VMA_CLOSE (1 << 28) #define VMA_NO_PROT_WRITE (1 << 29) diff --git a/criu/include/inet_diag.h b/criu/include/inet_diag.h index ea6f5e14e1..4996dd5563 100644 --- a/criu/include/inet_diag.h +++ b/criu/include/inet_diag.h @@ -31,7 +31,7 @@ struct inet_diag_req_compat { struct inet_diag_sockid id; __u32 idiag_states; /* States to dump */ - __u32 idiag_dbs; /* Tables to dump (NI) */ + __u32 idiag_dbs; /* Tables to dump (NI) */ }; struct inet_diag_req_v2 { diff --git a/criu/include/io_uring.h b/criu/include/io_uring.h new file mode 100644 index 0000000000..5c91a27245 --- /dev/null +++ b/criu/include/io_uring.h @@ -0,0 +1,108 @@ +#ifndef __CR_IO_URING_H__ +#define __CR_IO_URING_H__ + +#include + +#include "files.h" +#include "io_uring.pb-c.h" + +/* Definitions */ +struct __io_uring_restriction { + __u16 opcode; + union { + __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */ + __u8 sqe_op; /* IORING_RESTRICTION_SQE_OP */ + __u8 sqe_flags; /* IORING_RESTRICTION_SQE_FLAGS_* */ + }; + __u8 resv; + __u32 resv2[3]; +}; + +#ifndef IORING_SETUP_IOPOLL +#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#endif +#ifndef IORING_SETUP_SQPOLL +#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#endif +#ifndef IORING_SETUP_SQ_AFF +#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ +#endif +#ifndef IORING_SETUP_CQSIZE +#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ +#endif +#ifndef IORING_SETUP_ATTACH_WQ +#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ +#endif +#ifndef IORING_SETUP_R_DISABLED +#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ +#endif + +#ifndef IORING_OFF_SQ_RING +#define IORING_OFF_SQ_RING 0ULL +#endif +#ifndef IORING_OFF_CQ_RING +#define IORING_OFF_CQ_RING 0x8000000ULL +#endif +#ifndef IORING_OFF_SQES +#define 
IORING_OFF_SQES 0x10000000ULL +#endif + +#ifndef IOSQE_IO_DRAIN +#define IOSQE_IO_DRAIN (1U << 1) +#endif + +#define __IORING_RESTRICTION_REGISTER_OP 0 +#define __IORING_RESTRICTION_SQE_OP 1 +#define __IORING_RESTRICTION_SQE_FLAGS_ALLOWED 2 +#define __IORING_RESTRICTION_SQE_FLAGS_REQUIRED 3 +#define __IORING_REGISTER_PERSONALITY 9 +#define __IORING_REGISTER_RESTRICTIONS 11 +#define __IORING_REGISTER_ENABLE_RINGS 12 + +struct io_uring_file_info { + IoUringFileEntry *iofe; + struct file_desc d; +}; + +struct io_uring_data_info { + IoUringDataEntry *iode; +}; + +struct io_uring_group_desc { + struct list_head list; + gid_t group; + char group_name[32]; +}; + +struct io_uring_personality_desc { + int id; + uid_t uid; + uid_t euid; + uid_t suid; + uid_t fsuid; + gid_t gid; + gid_t egid; + gid_t sgid; + gid_t fsgid; + u32 cap_eff[CR_CAP_SIZE]; + size_t nr_groups; + struct list_head group_list; +}; + +struct io_uring_ctx; + +extern struct collect_image_info io_uring_cinfo; +extern struct collect_image_info io_uring_data_cinfo; +extern const struct fdtype_ops io_uring_dump_ops; + +int is_io_uring_link(char *link); +int io_uring_synchronize_fd(int fd); +int collect_io_uring_map(struct vma_area *vma); +int dump_io_uring_map(struct vma_area *vma); +int add_one_io_uring_mapping(uint64_t offset, ino_t inode); + +int io_uring_push_buf(struct io_uring_ctx *ctx, unsigned int idx, long long unsigned int address, unsigned int len); +int io_uring_push_personality(struct io_uring_ctx *ctx, struct io_uring_personality_desc *desc); +IoUringFileEntry *io_uring_get_iofe(struct io_uring_ctx *ctx); + +#endif /* __CR_IO_URING_H__ */ diff --git a/criu/include/kcmp.h b/criu/include/kcmp.h index a6774be471..575135f801 100644 --- a/criu/include/kcmp.h +++ b/criu/include/kcmp.h @@ -18,8 +18,8 @@ enum kcmp_type { /* Slot for KCMP_EPOLL_TFD */ typedef struct { - uint32_t efd; /* epoll file descriptor */ - uint32_t tfd; /* target file number */ + uint32_t efd; /* epoll file descriptor */ + uint32_t 
tfd; /* target file number */ uint32_t toff; /* target offset within same numbered sequence */ } kcmp_epoll_slot_t; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 80bad7f11d..a28a95802e 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -18,7 +18,7 @@ extern int kerndat_init(void); enum pagemap_func { PM_UNKNOWN, - PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */ + PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */ PM_FLAGS_ONLY, /* pagemap zeroes pfn part (user mode) */ PM_FULL, }; @@ -74,6 +74,7 @@ struct kerndat_s { bool has_pidfd_getfd; bool has_nspid; bool has_nftables_concat; + bool has_sockopt_buf_lock; }; extern struct kerndat_s kdat; diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h index 840d6277e1..9a3a28b100 100644 --- a/criu/include/linux/mount.h +++ b/criu/include/linux/mount.h @@ -8,13 +8,13 @@ #include #else enum fsconfig_command { - FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ - FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ - FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ - FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ - FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ - FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ - FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke 
superblock reconfiguration */ }; #endif diff --git a/criu/include/magic.h b/criu/include/magic.h index 22d7218e45..b968828e72 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -100,6 +100,8 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define IO_URING_FILE_MAGIC 0x55403656 /* Butyn */ +#define IO_URING_DATA_MAGIC 0x54194822 /* Ulyanovsk */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/mount.h b/criu/include/mount.h index 29d80c2a76..b959d131c4 100644 --- a/criu/include/mount.h +++ b/criu/include/mount.h @@ -72,19 +72,19 @@ struct mount_info { struct list_head children; struct list_head siblings; - struct list_head mnt_bind; /* circular list of derivatives of one real mount */ - struct list_head mnt_share; /* circular list of shared mounts */ + struct list_head mnt_bind; /* circular list of derivatives of one real mount */ + struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list; /* list of slave mounts */ - struct list_head mnt_slave; /* slave list entry */ - struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */ - struct list_head mnt_propagate; /* circular list of mounts which propagate from each other */ - struct list_head mnt_notprop; /* temporary list used in can_mount_now */ + struct list_head mnt_slave; /* slave list entry */ + struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */ + struct list_head mnt_propagate; /* circular list of mounts which propagate from each other */ + struct list_head mnt_notprop; /* temporary list used in can_mount_now */ struct list_head mnt_unbindable; /* list of mounts with delayed unbindable */ struct list_head postpone; int is_overmounted; - int remounted_rw; + int *remounted_rw; void *private; /* associated filesystem data */ }; @@ -100,7 +100,7 @@ static inline int 
collect_binfmt_misc(void) } #endif -extern struct mount_info *mnt_entry_alloc(void); +extern struct mount_info *mnt_entry_alloc(bool rst); extern void mnt_entry_free(struct mount_info *mi); extern int __mntns_get_root_fd(pid_t pid); @@ -109,7 +109,9 @@ extern int mntns_get_root_by_mnt_id(int mnt_id); extern struct ns_id *lookup_nsid_by_mnt_id(int mnt_id); extern int open_mount(unsigned int s_dev); -extern int __open_mountpoint(struct mount_info *pm, int mnt_fd); +extern int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo); +extern int check_mountpoint_fd(struct mount_info *pm, int mnt_fd); +extern int __open_mountpoint(struct mount_info *pm); extern int mnt_is_dir(struct mount_info *pm); extern int open_mountpoint(struct mount_info *pm); @@ -139,6 +141,7 @@ extern void clean_cr_time_mounts(void); extern bool add_skip_mount(const char *mountpoint); struct ns_id; +extern int get_sdev_from_fd(int fd, unsigned int *sdev, bool parse_mountinfo); extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump); extern int check_mnt_id(void); diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index 034605917c..e2ea6e17f6 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -128,9 +128,9 @@ struct ns_id { */ union { int nsfd_id; /* a namespace descriptor id in fdstore */ - int ns_fd; /* a namespace file descriptor */ + int ns_fd; /* a namespace file descriptor */ }; - int nlsk; /* for sockets collection */ + int nlsk; /* for sockets collection */ int seqsk; /* to talk to parasite daemons */ struct list_head ids; struct list_head links; diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index a5f97678eb..15178c0150 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -90,14 +90,14 @@ struct kernel_pipe_buffer { */ struct page_pipe_buf { - int p[2]; /* pipe with pages */ + int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be 
fit into pipe */ - unsigned int pipe_off; /* where this buf is started in a pipe */ - unsigned int pages_in; /* how many pages are there */ - unsigned int nr_segs; /* how many iov-s are busy */ + unsigned int pipe_off; /* where this buf is started in a pipe */ + unsigned int pages_in; /* how many pages are there */ + unsigned int nr_segs; /* how many iov-s are busy */ #define PPB_LAZY (1 << 0) unsigned int flags; - struct iovec *iov; /* vaddr:len map */ + struct iovec *iov; /* vaddr:len map */ struct list_head l; /* links into page_pipe->bufs */ }; @@ -113,27 +113,25 @@ struct page_pipe_buf { #define PP_HOLE_PARENT (1 << 0) struct page_pipe { - unsigned int nr_pipes; /* how many page_pipe_bufs in there */ - struct list_head bufs; /* list of bufs */ - struct list_head free_bufs; /* list of bufs */ + unsigned int nr_pipes; /* how many page_pipe_bufs in there */ + struct list_head bufs; /* list of bufs */ + struct list_head free_bufs; /* list of bufs */ struct page_pipe_buf *prev[PP_PIPE_TYPES]; /* last ppb of each type for pipe sharing */ - unsigned int nr_iovs; /* number of iovs */ - unsigned int free_iov; /* first free iov */ + unsigned int nr_iovs; /* number of iovs */ + unsigned int free_iov; /* first free iov */ struct iovec *iovs; /* iovs. 
They are provided into create_page_pipe and all bufs have their iov-s in there */ - unsigned int nr_holes; /* number of holes allocated */ + unsigned int nr_holes; /* number of holes allocated */ unsigned int free_hole; /* number of holes in use */ - struct iovec *holes; /* holes */ + struct iovec *holes; /* holes */ unsigned int *hole_flags; unsigned int flags; /* PP_FOO flags below */ }; -#define PP_CHUNK_MODE \ - 0x1 /* Restrict the maximum buffer size of pipes - and dump memory for a few iterations */ -#define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ +#define PP_CHUNK_MODE 0x1 /* Restrict the maximum buffer size of pipes and dump memory for a few iterations */ +#define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ struct page_pipe *create_page_pipe(unsigned int nr_segs, struct iovec *iovs, unsigned flags); extern void destroy_page_pipe(struct page_pipe *p); diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index e0303dfe0a..1bcd4ff205 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -36,7 +36,7 @@ struct page_xfer { union { struct /* local */ { struct cr_img *pmi; /* pagemaps */ - struct cr_img *pi; /* pages */ + struct cr_img *pi; /* pages */ }; struct /* page-server */ { diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h index 7612ee0f41..1d8bbffaf6 100644 --- a/criu/include/pagemap-cache.h +++ b/criu/include/pagemap-cache.h @@ -11,13 +11,13 @@ struct vma_area; #define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64)) typedef struct { - pid_t pid; /* which process it belongs */ - unsigned long start; /* start of area */ - unsigned long end; /* end of area */ + pid_t pid; /* which process it belongs */ + unsigned long start; /* start of area */ + unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ - u64 *map; /* local buffer */ - size_t map_len; /* length of a buffer */ - int fd; /* file to read PMs from */ 
+ u64 *map; /* local buffer */ + size_t map_len; /* length of a buffer */ + int fd; /* file to read PMs from */ } pmc_t; #define PMC_INIT \ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index c39c25d0cb..8c71805598 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -63,17 +63,14 @@ struct page_read { struct cr_img *pi; u32 pages_img_id; - PagemapEntry *pe; /* current pagemap we are on */ - struct page_read *parent; /* parent pagemap (if ->in_parent - pagemap is met in image, then - go to this guy for page, see - read_pagemap_page */ - unsigned long cvaddr; /* vaddr we are on */ - off_t pi_off; /* current offset in pages file */ - - struct iovec bunch; /* record consequent neighbour - iovecs to punch together */ - unsigned id; /* for logging */ + PagemapEntry *pe; /* current pagemap we are on */ + struct page_read *parent; /* parent pagemap (if ->in_parent pagemap is met in image, + * then go to this guy for page, see read_pagemap_page */ + unsigned long cvaddr; /* vaddr we are on */ + off_t pi_off; /* current offset in pages file */ + + struct iovec bunch; /* record consequent neighbour iovecs to punch together */ + unsigned id; /* for logging */ unsigned long img_id; /* pagemap image file ID */ PagemapEntry **pmes; diff --git a/criu/include/pipes.h b/criu/include/pipes.h index 6e6310e142..f442d7f65b 100644 --- a/criu/include/pipes.h +++ b/criu/include/pipes.h @@ -49,8 +49,8 @@ extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst struct pipe_info { PipeEntry *pe; struct list_head pipe_list; /* All pipe_info with the same pipe_id - * This is pure circular list without head */ - struct list_head list; /* global list of pipes */ + * This is pure circular list without head */ + struct list_head list; /* global list of pipes */ struct file_desc d; unsigned int create : 1, reopen : 1; }; diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101f..dc4634978e 100644 --- 
a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,8 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_IO_URING_FILE, + PB_IO_URING_DATA, /* PB_AUTOGEN_STOP */ diff --git a/criu/include/pstree.h b/criu/include/pstree.h index c5b0fa7ea9..c1c79867b2 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -15,14 +15,14 @@ struct pstree_item { struct pstree_item *parent; struct list_head children; /* list of my children */ - struct list_head sibling; /* linkage in my parent's children list */ + struct list_head sibling; /* linkage in my parent's children list */ struct pid *pid; pid_t pgid; pid_t sid; pid_t born_sid; - int nr_threads; /* number of threads */ + int nr_threads; /* number of threads */ struct pid *threads; /* array of threads */ CoreEntry **core; TaskKobjIdsEntry *ids; diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 934d60cf9a..308a0b79b3 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -138,7 +138,7 @@ struct task_restore_args { bool has_thp_enabled; /* threads restoration */ - int nr_threads; /* number of threads */ + int nr_threads; /* number of threads */ thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */ struct thread_restore_args *thread_args; /* array of thread arguments */ struct task_entries *task_entries; @@ -211,7 +211,7 @@ struct task_restore_args { bool can_map_vdso; bool auto_dedup; unsigned long vdso_rt_size; - struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ + struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ unsigned long vdso_rt_parked_at; /* safe place to keep vdso */ void **breakpoint; diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 2e2107b0eb..9664e0a1ca 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -17,7 +17,7 @@ struct task_entries { }; struct fdt { - int nr; /* How many tasks share this fd table */ + int nr; /* How many tasks share this fd table */ pid_t 
pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index e75e8444c8..c6979de7f4 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ -22,10 +22,10 @@ enum sfd_type { * - For dump -- target ns' proc * - For restore -- CRIU ns' proc */ - ROOT_FD_OFF, /* Root of the namespace we dump/restore */ + ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, - USERNSD_SK, /* Socket for usernsd */ - NS_FD_OFF, /* Node's net namespace fd */ + USERNSD_SK, /* Socket for usernsd */ + NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to transfer file descriptors */ RPC_SK_OFF, FDSTORE_SK_OFF, diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index c832d63877..5dd2a65518 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -35,7 +35,7 @@ struct inet_sk_desc { unsigned int dst_port; unsigned int state; unsigned int rqlen; - unsigned int wqlen; /* sent + unsent data */ + unsigned int wqlen; /* sent + unsent data */ unsigned int uwqlen; /* unsent data */ unsigned int src_addr[4]; unsigned int dst_addr[4]; diff --git a/criu/include/sockets.h b/criu/include/sockets.h index 3e8f3d6019..399d38664c 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -123,4 +123,8 @@ extern const char *socket_proto_name(unsigned int proto, char *nm, size_t size); #define ___socket_family_name(family) __socket_info_helper(socket_family_name, family) #define ___socket_proto_name(proto) __socket_info_helper(socket_proto_name, proto) +#ifndef SO_BUF_LOCK +#define SO_BUF_LOCK 72 +#endif + #endif /* __CR_SOCKETS_H__ */ diff --git a/criu/include/sysfs_parse.h b/criu/include/sysfs_parse.h index ff0e611486..f987d622f6 100644 --- a/criu/include/sysfs_parse.h +++ b/criu/include/sysfs_parse.h @@ -2,9 +2,9 @@ #define __CR_SYSFS_PARSE_H__ #define SYSFS_AUFS "/sys/fs/aufs/" -#define SBINFO_LEN (3 + 16 + 1) /* 
si_%lx */ +#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ #define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/ */ -#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%3d */ +#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%3d */ struct mount_info; struct vma_area; diff --git a/criu/include/tls.h b/criu/include/tls.h index 26f9976fd7..f563c092c6 100644 --- a/criu/include/tls.h +++ b/criu/include/tls.h @@ -4,7 +4,7 @@ #ifdef CONFIG_GNUTLS int tls_x509_init(int sockfd, bool is_server); -void tls_terminate_session(void); +void tls_terminate_session(bool async); ssize_t tls_send(const void *buf, size_t len, int flags); ssize_t tls_recv(void *buf, size_t len, int flags); @@ -19,7 +19,7 @@ int tls_recv_data_to_fd(int fd, unsigned long len); #define tls_recv(buf, len, flags) (-1) #define tls_send_data_from_fd(fd, len) (-1) #define tls_recv_data_to_fd(fd, len) (-1) -#define tls_terminate_session() +#define tls_terminate_session(async) #endif /* CONFIG_HAS_GNUTLS */ diff --git a/criu/include/util.h b/criu/include/util.h index a2dac22335..19d378fc54 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -284,8 +284,8 @@ int setup_tcp_server(char *type, char *addr, unsigned short *port); int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk); int setup_tcp_client(char *hostname); -/* *dir should be writable and at least PATH_MAX long */ -int rm_rf(char *dir); +/* path should be writable and no more than PATH_MAX long */ +int rmrf(char *path); #define LAST_PID_PATH "sys/kernel/ns_last_pid" #define PID_MAX_PATH "sys/kernel/pid_max" diff --git a/criu/include/vma.h b/criu/include/vma.h index ed9f31ef67..864509881d 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -10,14 +10,14 @@ #include struct vm_area_list { - struct list_head h; /* list of VMAs */ - unsigned nr; /* nr of all VMAs in the list */ + struct list_head h; /* list of VMAs */ + unsigned nr; /* nr of all VMAs in the list */ unsigned int 
nr_aios; /* nr of AIOs VMAs in the list */ union { unsigned long nr_priv_pages; /* dmp: nr of pages in private VMAs */ unsigned long rst_priv_size; /* rst: size of private VMAs */ }; - unsigned long nr_priv_pages_longest; /* nr of pages in longest private VMA */ + unsigned long nr_priv_pages_longest; /* nr of pages in longest private VMA */ unsigned long nr_shared_pages_longest; /* nr of pages in longest shared VMA */ }; @@ -35,7 +35,10 @@ struct vma_area { union { struct /* for dump */ { - int vm_socket_id; + union { + int vm_socket_id; + int io_uring_id; + }; char *aufs_rpath; /* path from aufs root */ char *aufs_fpath; /* full path from global root */ @@ -53,8 +56,8 @@ struct vma_area { struct /* for restore */ { int (*vm_open)(int pid, struct vma_area *vma); struct file_desc *vmfd; - struct vma_area *pvma; /* parent for inherited VMAs */ - unsigned long *page_bitmap; /* existent pages */ + struct vma_area *pvma; /* parent for inherited VMAs */ + unsigned long *page_bitmap; /* existent pages */ unsigned long premmaped_addr; /* restore only */ /* diff --git a/criu/io_uring.c b/criu/io_uring.c new file mode 100644 index 0000000000..906e8f4ac5 --- /dev/null +++ b/criu/io_uring.c @@ -0,0 +1,1036 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "bitmap.h" +#include "fdinfo.h" +#include "imgset.h" +#include "string.h" +#include "file-ids.h" +#include "io_uring.h" +#include "protobuf.h" +#include "common/list.h" + +#include + +#define CTX_F_SEEN_SQE (1UL << 0) /* SQE ring mapped */ +#define CTX_F_SEEN_SQE_ARR (1UL << 1) /* SQE array mapped */ +#define CTX_F_SEEN_CQE (1UL << 2) /* CQE ring mapped */ +#define CTX_F_SEEN_RINGS (CTX_F_SEEN_SQE | CTX_F_SEEN_SQE_ARR | CTX_F_SEEN_CQE) +#define CTX_F_SINGLE_MMAP (1UL << 3) /* SQE/CQE ring are in single mapping */ +#define CTX_F_DONE_FILE (1UL << 4) /* File dump done */ +#define CTX_F_DONE_DATA (1UL << 5) /* Data dump done */ +#define 
CTX_F_DONE_ALL (CTX_F_DONE_FILE | CTX_F_DONE_DATA) +#define CTX_F_INIT_IOFE (1UL << 6) /* Iofe set for ctx */ + +#define atomic_load_relaxed(x) __atomic_load_n((x), __ATOMIC_RELAXED) +#define atomic_load_acquire(x) __atomic_load_n((x), __ATOMIC_ACQUIRE) +#define atomic_store_release(x, val) __atomic_store_n((x), (val), __ATOMIC_RELEASE) + +#define IO_URING_HASH_TABLE_BITS 5 +#define IO_URING_HASH_TABLE_MAX (1UL << IO_URING_HASH_TABLE_BITS) +#define IO_URING_HASH_TABLE_MASK (IO_URING_HASH_TABLE_MAX - 1) + +#ifndef IORING_FEAT_SQPOLL_NONFIXED +#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) +#endif + +struct io_uring_map { + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + unsigned int *sq_array; + unsigned int *sq_ring_tail; + unsigned int *sq_ring_head; + unsigned int *cqe_ring_head; + unsigned int *cqe_ring_tail; + unsigned int *sq_ring_mask; + unsigned int *cqe_ring_mask; + size_t sq_len; + size_t sqe_len; + size_t cqe_len; +}; + +struct io_uring_buf { + struct list_head list; + unsigned int idx; + long long unsigned int address; + unsigned int len; +}; + +/* We store uid name in image to avoid mismatch on restore which could turn into + * a potential security risk, as user name may not match for the same UID and + * user may end up exposing resources to other users unintentionally. 
/*
 * One registered io_uring personality as collected at dump time: the raw
 * credential descriptor plus the resolved user/group names for each of the
 * eight IDs in it. Names that could not be resolved stay empty (the object is
 * zero-allocated by io_uring_push_personality()).
 */
struct io_uring_personality {
	struct list_head list; /* linkage in io_uring_ctx.dump.pers_list */
	struct io_uring_personality_desc desc;
	char uid_name[32];
	char euid_name[32];
	char suid_name[32];
	char fsuid_name[32];
	char gid_name[32];
	char egid_name[32];
	char sgid_name[32];
	char fsgid_name[32];
};

/*
 * Per-instance bookkeeping, hashed by inode in ctx_hash_table.
 *
 * The dump and restore members share storage through the anonymous union, so
 * a context is used either for dumping or for restoring, never both.
 */
struct io_uring_ctx {
	struct io_uring_ctx *next; /* hash bucket chain */
	ino_t inode;		   /* lookup key, see lookup_ctx() */
	u32 id;
	u32 state; /* CTX_F_* flags */
	union {
		struct {
			IoUringFileEntry iofe;
			struct io_uring_map map;

			struct list_head buf_list;  /* of struct io_uring_buf */
			struct list_head pers_list; /* of struct io_uring_personality */
			size_t nr_pers;		    /* number of entries on pers_list */
		} dump;
		struct {
			/* Raw ring contents, laid out as |SQE|CQE|SQARR| */
			void *data;
			size_t sqe_bytes;
			size_t cqe_bytes;
			size_t sq_arr_bytes;
		} restore;
	};
};

/* Chained hash table of contexts; the bucket is the low bits of the inode. */
static struct io_uring_ctx *ctx_hash_table[IO_URING_HASH_TABLE_MAX];

/*
 * Allocate a zeroed context and initialise the dump-side lists.
 * Returns NULL when allocation fails.
 *
 * NOTE(review): the list heads live in the dump arm of the union, so on the
 * restore side this initialisation lands on the storage of restore.* fields;
 * presumably those are assigned afterwards - confirm against the caller.
 */
static struct io_uring_ctx *alloc_ctx(void)
{
	struct io_uring_ctx *ctx;

	ctx = xzalloc(sizeof(*ctx));
	if (!ctx)
		return NULL;

	INIT_LIST_HEAD(&ctx->dump.buf_list);
	INIT_LIST_HEAD(&ctx->dump.pers_list);

	return ctx;
}

/* Find the context registered for @inode, or NULL when there is none. */
static struct io_uring_ctx *lookup_ctx(ino_t inode)
{
	struct io_uring_ctx *ctx;

	ctx = ctx_hash_table[inode & IO_URING_HASH_TABLE_MASK];
	for (; ctx; ctx = ctx->next) {
		if (ctx->inode == inode)
			break;
	}

	return ctx;
}

/*
 * Push @ctx onto the head of the bucket selected by @inode.
 * Only the bucket is derived from @inode here; ctx->inode itself is not
 * written, so the caller has to set it for lookup_ctx() to match.
 */
static void insert_ctx(ino_t inode, struct io_uring_ctx *ctx)
{
	struct io_uring_ctx **slot;

	slot = &ctx_hash_table[inode & IO_URING_HASH_TABLE_MASK];
	ctx->next = *slot;
	*slot = ctx;
}

/*
 * Translate an io_uring mmap offset into the CTX_F_SEEN_* flag recording that
 * the corresponding region was seen. Unknown offsets yield 0.
 */
static uint64_t offset_to_state(uint64_t offset)
{
	switch (offset) {
	case IORING_OFF_SQ_RING:
		return CTX_F_SEEN_SQE;
	case IORING_OFF_CQ_RING:
		return CTX_F_SEEN_CQE;
	case IORING_OFF_SQES:
		return CTX_F_SEEN_SQE_ARR;
	default:
		return 0;
	}
}

/* Printable name of an io_uring mmap offset, "Unknown" for anything else. */
static const char *offset_to_str(uint64_t offset)
{
	switch (offset) {
	case IORING_OFF_SQ_RING:
		return "IORING_OFF_SQ_RING";
	case IORING_OFF_CQ_RING:
		return "IORING_OFF_CQ_RING";
	case IORING_OFF_SQES:
		return "IORING_OFF_SQES";
	default:
		return "Unknown";
	}
}
int idx, long long unsigned int address, unsigned int len) +{ + struct io_uring_buf *buf; + + buf = xzalloc(sizeof(*buf)); + if (!buf) + return -ENOMEM; + + buf->idx = idx; + buf->address = address; + buf->len = len; + list_add_tail(&buf->list, &ctx->dump.buf_list); + + return 0; +} + +int io_uring_push_personality(struct io_uring_ctx *ctx, struct io_uring_personality_desc *desc) +{ + struct io_uring_personality *p; + struct io_uring_group_desc *g; + struct passwd *pwd; + struct group *grp; + int grps = 0; + + p = xzalloc(sizeof(*p)); + if (!p) + return -ENOMEM; + INIT_LIST_HEAD(&p->list); + + p->desc = *desc; + INIT_LIST_HEAD(&p->desc.group_list); + +#define X(ptr, sub) \ + pwd = getpwuid(desc->sub); \ + if (pwd) \ + strlcpy(ptr->sub##_name, pwd->pw_name, sizeof(ptr->sub##_name)); + X(p, uid); + X(p, euid); + X(p, suid); + X(p, fsuid); +#undef X +#define X(ptr, sub) \ + grp = getgrgid(desc->sub); \ + if (grp) \ + strlcpy(ptr->sub##_name, grp->gr_name, sizeof(ptr->sub##_name)); + X(p, gid); + X(p, egid); + X(p, sgid); + X(p, fsgid); +#undef X + + list_for_each_entry(g, &desc->group_list, list) { + grp = getgrgid(g->group); + if (pwd) + strlcpy(g->group_name, grp->gr_name, sizeof(g->group_name)); + grps++; + } + BUG_ON(grps != desc->nr_groups); + + /* Migrate prepared group list from local desc to personality object */ + list_splice(&desc->group_list, &p->desc.group_list); + + /* ... 
and append personality object to ctx personality list */ + list_add_tail(&p->list, &ctx->dump.pers_list); + ctx->dump.nr_pers++; + return 0; +} + +IoUringFileEntry *io_uring_get_iofe(struct io_uring_ctx *ctx) +{ + return &ctx->dump.iofe; +} + +/* + * TODO: + * Handle IORING_REGISTER_BUFFERS + * Handle IORING_REGISTER_FILES + * Handle IORING_REGISTER_EVENTFD_{ASYNC} + * + * Handle wq_fd registration + * * Compare in-kernel ctx->sq_data to associate with open fd + * Audit memory cleanup after error at various places + */ + +static int sys_io_uring_setup(unsigned int entries, struct io_uring_params *p) +{ + return (int)syscall(__NR_io_uring_setup, entries, p); +} + +/* XXX: We can expose timeout here to not block indefinitely when trying to sync + * io_uring fd during dump stage, in case forward progress depends on one + * of the stopped threads. + */ +static int sys_io_uring_enter(int ring_fd, unsigned int to_submit, unsigned int min_complete, unsigned int flags) +{ + return (int)syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete, flags, NULL, 0); +} + +static int sys_io_uring_register(int ring_fd, unsigned int opcode, void *arg, unsigned int nr_args) +{ + return (int)syscall(__NR_io_uring_register, ring_fd, opcode, arg, nr_args); +} + +static int io_uring_restore_personality(int fd, IoUringPersonalityId *pers_id) +{ + struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {}; + struct cap_header hdr; + pid_t pid; + int ret; + + /* fork into a new child to manipulate credentials and register personality */ + pid = fork(); + if (pid) { + pid = waitpid(pid, &ret, 0); + if (pid < 0) + return -errno; + return -ret; + } else if (!pid) { + u32 cap[2] = { + pers_id->cap_eff & 0xffffffff00000000, + pers_id->cap_eff & 0x00000000ffffffff, + }; + size_t n_grps = 0, sz = 32; + struct passwd *pwd; + bool group = false; + struct group *grp; + gid_t *groups; + +#define X(c, m, x) \ + if (c) { \ + if (strcmp(c->m##_name, pers_id->x##_name)) \ + pr_warn("User name from image and 
system do not match for %s %d\n", group ? "GID" : "UID", \ + pers_id->x); \ + } else { \ + pr_warn("No user for %s %d on system\n", group ? "GID" : "UID", pers_id->x); \ + } + pwd = getpwuid(pers_id->uid); + X(pwd, pw, uid); + pwd = getpwuid(pers_id->euid); + X(pwd, pw, euid); + pwd = getpwuid(pers_id->suid); + X(pwd, pw, suid); + pwd = getpwuid(pers_id->fsuid); + X(pwd, pw, fsuid); + + group = true; + + grp = getgrgid(pers_id->gid); + X(grp, gr, gid); + grp = getgrgid(pers_id->egid); + X(grp, gr, egid); + grp = getgrgid(pers_id->sgid); + X(grp, gr, sgid); + grp = getgrgid(pers_id->fsgid); + X(grp, gr, fsgid); +#undef X + + ret = setresuid(pers_id->uid, pers_id->euid, pers_id->suid); + if (ret < 0) + goto end; + ret = setfsuid(pers_id->fsuid); + if (ret < 0) + goto end; + ret = setresgid(pers_id->gid, pers_id->euid, pers_id->suid); + if (ret < 0) + goto end; + ret = setfsgid(pers_id->fsgid); + if (ret < 0) + goto end; + + groups = xmalloc(sz * sizeof(*groups)); + if (!groups) { + errno = ENOMEM; + goto end; + } + + for (int i = 0; i < pers_id->n_group_id; i++) { + IoUringGroupId *gd = pers_id->group_id[i]; + struct group *grp; + gid_t *g; + + grp = getgrgid(gd->group); + if (!grp) + pr_warn("Group name not found for GID %d\n", gd->group); + if (strcmp(gd->group_name, grp->gr_name)) + pr_warn("Group name in image and on system do not match for GID %d\n", gd->group); + + if (sz <= n_grps) { + sz *= 2; + g = xrealloc(groups, sz * sizeof(*g)); + if (!g) { + xfree(groups); + errno = ENOMEM; + goto end; + } + groups = g; + } + groups[n_grps++] = gd->group; + } + + ret = setgroups(n_grps, groups); + xfree(groups); + if (ret < 0) { + errno = -ret; + goto end; + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE); + + for (int i = 0; i < CR_CAP_SIZE; i++) + data[i].eff = cap[i]; + + ret = syscall(__NR_capset, &hdr, data); + if (ret < 0) { + errno = -ret; + goto end; + } + + ret = sys_io_uring_register(fd, 
__IORING_REGISTER_PERSONALITY, NULL, 0); + if (ret < 0) { + errno = -ret; + goto end; + } + + exit(0); + end: + exit(errno); + } else { + return -errno; + } + + return 0; +} + +int is_io_uring_link(char *link) +{ + return is_anon_link_type(link, "[io_uring]"); +} + +static void io_uring_submit_nop(struct io_uring_map *map, bool barrier) +{ + unsigned int tail, index; + + BUG_ON(!map); + + tail = atomic_load_acquire(map->sq_ring_tail); + index = tail & *map->sq_ring_mask; + map->sqe[index].opcode = IORING_OP_NOP; + if (barrier) + map->sqe[index].flags = IOSQE_IO_DRAIN; + map->sq_array[index] = index; + atomic_store_release(map->sq_ring_tail, tail + 1); +} + +static int io_uring_consume_n(struct io_uring_map *map, int n) +{ + unsigned int head; + int ret; + + BUG_ON(!map); + + head = *map->cqe_ring_head; + ret = map->cqe[head & *map->cqe_ring_mask].res; + atomic_store_release(map->cqe_ring_head, head + n); + + return ret; +} + +static void io_uring_consume_all(struct io_uring_map *map) +{ + BUG_ON(!map); + + (void)io_uring_consume_n(map, atomic_load_acquire(map->cqe_ring_tail) - *map->cqe_ring_head); +} + +static int map_io_uring_fd(int fd, struct io_uring_params *p, struct io_uring_map *map) +{ + int ret = 0; + + BUG_ON(!p); + BUG_ON(!map); + + /* XXX: Optimize using FEAT_SINGLE_MMAP */ + map->sq_len = p->sq_off.array + p->sq_entries * sizeof(unsigned int); + map->cqe_len = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); + map->sqe_len = p->sq_entries * sizeof(struct io_uring_sqe); + + map->sq_array = + mmap(NULL, map->sq_len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); + if (map->sq_array == MAP_FAILED) { + ret = -errno; + pr_perror("Failed to mmap SQ array ring"); + goto end; + } + + map->cqe = mmap(NULL, map->cqe_len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); + if (map->cqe == MAP_FAILED) { + ret = -errno; + pr_perror("Failed to mmap CQE ring"); + goto end_sq_ptr; + } + + 
map->sq_ring_head = map->sq_array + p->sq_off.head; + map->sq_ring_tail = map->sq_array + p->sq_off.tail; + map->cqe_ring_head = (unsigned int *)map->cqe + p->cq_off.head; + map->cqe_ring_tail = (unsigned int *)map->cqe + p->cq_off.tail; + map->sq_ring_mask = map->sq_array + p->sq_off.ring_mask; + map->cqe_ring_mask = (unsigned int *)map->cqe + p->cq_off.ring_mask; + map->sq_array += p->sq_off.array; + + map->sqe = mmap(NULL, map->sqe_len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); + if (map->sqe == MAP_FAILED) { + ret = -errno; + pr_perror("Failed to mmap SQE ring"); + goto end_cqe_ptr; + } + + return ret; + + munmap(map->sqe, map->sqe_len); +end_cqe_ptr: + munmap(map->cqe, map->cqe_len); +end_sq_ptr: + munmap(map->sq_array, map->sq_len); +end: + return ret; +} + +static void unmap_io_uring_fd(struct io_uring_map *map) +{ + BUG_ON(!map); + BUG_ON(!map->sqe); + BUG_ON(!map->cqe); + BUG_ON(!map->sq_array); + + munmap(map->sqe, map->sqe_len); + munmap(map->cqe, map->cqe_len); + munmap(map->sq_array, map->sq_len); +} + +int io_uring_synchronize_fd(int fd) +{ + struct io_uring_map map = {}; + struct io_uring_params p; + struct io_uring_ctx *ctx; + unsigned int rem; + struct stat st; + bool sq_poll; + int ret; + + if (fd < 0) + return fd; + + if (fstat(fd, &st)) + return -errno; + + ctx = lookup_ctx(st.st_ino); + if (!ctx) + return -ENOENT; + + assert("File Entry must be unitialized" && !(ctx->state & CTX_F_INIT_IOFE)); + /* Obtains sq_off.array, while the rest are offsets we can get from a + * io_uring_setup call. Also caches this in ctx so that we don't have to + * parse once again. 
+ */ + if (parse_fdinfo(fd, FD_TYPES__IO_URING, ctx)) + return -EINVAL; + ctx->state |= CTX_F_INIT_IOFE; + return 0; + + sq_poll = ctx->dump.iofe.setup_flags & IORING_SETUP_SQPOLL; + + memset(&p, 0, sizeof(p)); + ret = sys_io_uring_setup(1, &p); + if (ret < 0) + return -errno; + close(ret); + + p.sq_off.array = ctx->dump.iofe.sq_off_array; + p.sq_entries = ctx->dump.iofe.sq_entries; + p.cq_entries = ctx->dump.iofe.cq_entries; + + ret = map_io_uring_fd(fd, &p, &map); + if (ret < 0) + return ret; + + /* Preserve head/tail and ring mask */ + ctx->dump.iofe.sq_head = atomic_load_acquire(map.sq_ring_head); + ctx->dump.iofe.sq_tail = *map.sq_ring_tail; + ctx->dump.iofe.cqe_head = *map.cqe_ring_head; + ctx->dump.iofe.sq_ring_mask = *map.sq_ring_mask; + + io_uring_consume_all(&map); + + rem = ctx->dump.iofe.sq_tail - ctx->dump.iofe.sq_head; + /* XXX: Add timeout to gracefully handle indefinite blocking */ + ret = sys_io_uring_enter(fd, rem, rem, IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP); + if (ret < 0) { + ret = -errno; + pr_perror("Failed to call io_uring_enter"); + } + + ctx->dump.iofe.cqe_tail = atomic_load_acquire(map.cqe_ring_tail); + if (sq_poll) + ctx->dump.iofe.sq_head = ctx->dump.iofe.sq_tail; + + ctx->dump.map = map; + return ret; +} + +static int replay_io_uring_data(int fd, struct io_uring_ctx *ctx, struct io_uring_params *p, IoUringFileEntry *iofe) +{ + unsigned int nop_count, cons_count; + struct io_uring_map map; + int ret = 0, flags = 0; + void *data; + + BUG_ON(!ctx); + BUG_ON(!p); + BUG_ON(!iofe); + BUG_ON(p->sq_entries != ctx->restore.sqe_bytes / sizeof(struct io_uring_sqe)); + BUG_ON(p->cq_entries != ctx->restore.cqe_bytes / sizeof(struct io_uring_cqe)); + BUG_ON(p->sq_entries != ctx->restore.sq_arr_bytes / sizeof(unsigned int)); + + /* To replay the data, we first need to advance head and tail to the + * values they were when the io_uring instance was dumped. 
At the ABI + * level the request and completion structure have same size for all + * operations, so filling IORING_OP_NOP operations and reaping them + * adjust the kernel's offsets, after which we overwrite the ring with + * data we dumped in the image. + */ + if (p->flags & IORING_SETUP_SQPOLL) + flags |= IORING_ENTER_SQ_WAKEUP; + + ret = map_io_uring_fd(fd, p, &map); + if (ret < 0) + return ret; + + nop_count = iofe->sq_head & iofe->sq_ring_mask; + cons_count = iofe->cqe_tail & iofe->cq_ring_mask; + + for (int i = 0; i < nop_count; i++) + io_uring_submit_nop(&map, false); + + ret = sys_io_uring_enter(fd, nop_count, nop_count, IORING_ENTER_GETEVENTS | flags); + if (ret < 0) { + pr_perror("Failed to call io_uring_enter"); + goto end; + } + + io_uring_consume_n(&map, cons_count); + + data = ctx->restore.data; + memcpy(map.sqe, data, ctx->restore.sqe_bytes); + data += ctx->restore.sqe_bytes; + memcpy(map.cqe, data, ctx->restore.cqe_bytes); + data += ctx->restore.cqe_bytes; + memcpy(map.sq_array, data, ctx->restore.sq_arr_bytes); + +end: + xfree(ctx->restore.data); + unmap_io_uring_fd(&map); + return ret; +} + +static int dump_one_io_uring_data(struct io_uring_ctx *ctx, IoUringFileEntry *iofe, int lfd, const struct fd_parms *p) +{ + IoUringDataEntry iode = IO_URING_DATA_ENTRY__INIT; + struct io_uring_map *map; + struct cr_img *img; + int ret; + + map = &ctx->dump.map; + + BUG_ON(!map->sqe); + BUG_ON(!map->cqe); + BUG_ON(!map->sq_array); + + img = img_from_set(glob_imgset, CR_FD_IO_URING_DATA); + BUG_ON(ctx->state & CTX_F_DONE_DATA); + + iode.id = ctx->inode; + iode.sqe_bytes = sizeof(struct io_uring_sqe) * ctx->dump.iofe.sq_entries; + iode.cqe_bytes = sizeof(struct io_uring_cqe) * ctx->dump.iofe.cq_entries; + iode.sq_arr_bytes = sizeof(unsigned int) * ctx->dump.iofe.sq_entries; + + ret = -1; + if (pb_write_one(img, &iode, PB_IO_URING_DATA)) + goto end; + + /* Layout |SQE|CQE|SQARR| */ + if (write(img_raw_fd(img), map->sqe, iode.sqe_bytes) != iode.sqe_bytes) + goto 
/*
 * fdtype_ops.dump callback for io_uring descriptors.
 *
 * Looks up the context by the inode in @p, converts the personalities
 * collected on ctx->dump.pers_list into protobuf entries (freeing the list
 * nodes as it goes), writes the file entry to the CR_FD_FILES image and then
 * hands over to dump_one_io_uring_data() for the ring contents.
 *
 * Requires that the context's file entry was initialised (CTX_F_INIT_IOFE)
 * and that the file has not been dumped yet (CTX_F_DONE_FILE clear).
 *
 * Returns 0 on success, -ENOENT when no context is registered for the inode,
 * -ENOMEM on allocation failure, -1 when writing the image fails, or whatever
 * dump_one_io_uring_data() returns.
 *
 * NOTE(review): the entry written through fe.io_uring is the local iofe,
 * while the personality array is attached to ctx->dump.iofe - confirm which
 * of the two is meant to reach the image.
 * NOTE(review): the early -ENOMEM returns leave already allocated entries and
 * partially freed lists behind; presumably acceptable because the dump is
 * aborted - confirm.
 */
static int dump_one_io_uring(int lfd, u32 id, const struct fd_parms *p)
{
	IoUringFileEntry iofe = IO_URING_FILE_ENTRY__INIT;
	struct io_uring_personality *per_i, *ptmp;
	struct io_uring_buf *buf_i, *btmp;
	FileEntry fe = FILE_ENTRY__INIT;
	struct io_uring_ctx *ctx;
	int i = 0, j = 0;

	ctx = lookup_ctx(p->stat.st_ino);
	if (!ctx)
		return -ENOENT;

	BUG_ON(!(ctx->state & CTX_F_INIT_IOFE));
	BUG_ON(ctx->state & CTX_F_DONE_FILE);

	iofe.id = ctx->id = id;
	iofe.inode = ctx->inode;
	iofe.flags = p->flags;
	iofe.fown = (FownEntry *)&p->fown;

	fe.type = FD_TYPES__IO_URING;
	fe.id = iofe.id;
	fe.io_uring = &iofe;

	/* Registered buffers are not dumped yet; just release the records */
	list_for_each_entry_safe(buf_i, btmp, &ctx->dump.buf_list, list) {
		/* XXX: match struct page address for buf_i->idx from eBPF
		 * iterator output
		 */
		xfree(buf_i);
	}

	/* A non-empty personality list with a zero counter is a bookkeeping bug */
	BUG_ON(!list_empty(&ctx->dump.pers_list) && !ctx->dump.nr_pers);
	ctx->dump.iofe.n_pers_id = ctx->dump.nr_pers;
	ctx->dump.iofe.pers_id = xzalloc(pb_repeated_size(&ctx->dump.iofe, pers_id));
	if (!ctx->dump.iofe.pers_id)
		return -ENOMEM;

	list_for_each_entry_safe(per_i, ptmp, &ctx->dump.pers_list, list) {
		struct io_uring_group_desc *grp_i, *gtmp;
		IoUringPersonalityId *pers_id;

		/* List order must match personality ids counting up from 1 */
		BUG_ON(i + 1 != per_i->desc.id);
		/*
		 * NOTE(review): the entry is zero-allocated, not set up with a
		 * protobuf init macro - confirm the writer accepts that.
		 */
		ctx->dump.iofe.pers_id[i] = xzalloc(sizeof(*ctx->dump.iofe.pers_id[i]));
		if (!ctx->dump.iofe.pers_id[i])
			return -ENOMEM;

		pers_id = ctx->dump.iofe.pers_id[i];

		/* Numeric IDs are copied straight from the descriptor */
#define X(x) pers_id->x = per_i->desc.x;
		X(uid);
		X(euid);
		X(suid);
		X(fsuid);
		X(gid);
		X(egid);
		X(sgid);
		X(fsgid);
#undef X

		/* Names are duplicated because per_i is freed at the end of the iteration */
#define X(x)                                          \
	pers_id->x##_name = xstrdup(per_i->x##_name); \
	if (!pers_id->x##_name)                       \
		return -ENOMEM;
		X(uid);
		X(euid);
		X(suid);
		X(fsuid);
		X(gid);
		X(egid);
		X(sgid);
		X(fsgid);
#undef X
		/* Raw byte copy of the descriptor's capability array into the image field */
		memcpy(&pers_id->cap_eff, per_i->desc.cap_eff, sizeof(per_i->desc.cap_eff));
		BUG_ON(!list_empty(&per_i->desc.group_list) && !per_i->desc.nr_groups);
		pers_id->n_group_id = per_i->desc.nr_groups;
		pers_id->group_id = xzalloc(pb_repeated_size(pers_id, group_id));
		if (!pers_id->group_id)
			return -ENOMEM;
		/* Now, iterate over group list for personality, and dump each
		 * group ID and group name
		 */
		j = 0;
		list_for_each_entry_safe(grp_i, gtmp, &per_i->desc.group_list, list) {
			pers_id->group_id[j] = xzalloc(sizeof(*pers_id->group_id[j]));
			if (!pers_id->group_id[j])
				return -ENOMEM;
			pers_id->group_id[j]->group = grp_i->group;
			pers_id->group_id[j]->group_name = xstrdup(grp_i->group_name);
			if (!pers_id->group_id[j]->group_name)
				return -ENOMEM;
			j++;
			xfree(grp_i);
		}
		BUG_ON(j != per_i->desc.nr_groups);
		i++;
		xfree(per_i);
	}
	BUG_ON(i != ctx->dump.nr_pers);

	if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE))
		return -1;
	ctx->state |= CTX_F_DONE_FILE;

	return dump_one_io_uring_data(ctx, &iofe, lfd, p);
}

/* Dump operations for descriptors of type FD_TYPES__IO_URING. */
const struct fdtype_ops io_uring_dump_ops = {
	.type = FD_TYPES__IO_URING,
	.dump = dump_one_io_uring,
};
Cycles cannot + * be created as io_uring won't allow IORING_REGISTER_FILES for another + * io_uring, so we cannot deadlock, and wq_fd registration won't be + * circular either. wq_fd is determined using ctx->sq_data matching in + * eBPF iteration. + */ + ctx = lookup_ctx(iofe->id); + if (!ctx) + return -ENOENT; + + memset(&p, 0, sizeof(p)); + p.sq_thread_cpu = iofe->sq_thread_cpu; + p.sq_thread_idle = iofe->sq_thread_idle; + p.cq_entries = iofe->cq_entries; + p.flags = iofe->setup_flags | IORING_SETUP_CQSIZE; + + if (iofe->restrictions) + p.flags |= IORING_SETUP_R_DISABLED; + + fd = sys_io_uring_setup(iofe->sq_entries, &p); + if (fd < 0) + return -errno; + + for (int i = 0; i < iofe->n_pers_id; i++) { + IoUringPersonalityId *pers_id = iofe->pers_id[i]; + + ret = io_uring_restore_personality(fd, pers_id); + if (ret < 0) + goto end; + } + + if (iofe->restrictions) { + int nr = 0; + + if (iofe->reg_op) { + res[nr].opcode = __IORING_RESTRICTION_REGISTER_OP; + res[nr++].register_op = iofe->reg_op; + } + + if (iofe->sqe_op) { + res[nr].opcode = __IORING_RESTRICTION_SQE_OP; + res[nr++].sqe_op = iofe->sqe_op; + } + + if (iofe->sqe_flags_allowed) { + res[nr].opcode = __IORING_RESTRICTION_SQE_FLAGS_ALLOWED; + res[nr++].sqe_flags = iofe->sqe_flags_allowed; + } + + if (iofe->sqe_flags_required) { + res[nr].opcode = __IORING_RESTRICTION_SQE_FLAGS_REQUIRED; + res[nr++].sqe_flags = iofe->sqe_flags_required; + } + + BUG_ON(nr >= ARRAY_SIZE(res)); + if (nr) { + ret = sys_io_uring_register(fd, __IORING_REGISTER_RESTRICTIONS, res, nr); + if (ret < 0) + goto end; + } + + ret = sys_io_uring_register(fd, __IORING_REGISTER_ENABLE_RINGS, NULL, 0); + if (ret < 0) + goto end; + } + + if ((p.flags & IORING_SETUP_SQPOLL) && !iofe->nr_user_files && !(p.features & IORING_FEAT_SQPOLL_NONFIXED)) { + ret = -ENOTSUP; + pr_err("Dumped io_uring instance %#08x has IORING_SETUP_SQPOLL flag, but no registered files,\n" + "and system does not support SQPOLL in this mode, as IORING_FEAT_SQPOLL_NONFIXED 
\n" + "feature is missing\n", + iofe->id); + goto end; + } + + if (rst_file_params(fd, iofe->fown, iofi->iofe->flags)) { + pr_perror("Can't restore file params on io_uring %#08x", iofe->id); + goto end; + } + + ret = replay_io_uring_data(fd, ctx, &p, iofe); + if (ret < 0) + goto end; + + *new_fd = fd; + + return 0; +end: + close(fd); + return ret; +} + +static struct file_desc_ops io_uring_desc_ops = { + .type = FD_TYPES__IO_URING, + .open = open_io_uring_desc, +}; + +static int collect_one_io_uring(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct io_uring_file_info *iofi = o; + struct io_uring_ctx *ctx; + + ctx = alloc_ctx(); + if (!ctx) + return -ENOMEM; + + iofi->iofe = pb_msg(base, IoUringFileEntry); + ctx->inode = iofi->iofe->id; + insert_ctx(iofi->iofe->id, ctx); + return file_desc_add(&iofi->d, iofi->iofe->id, &io_uring_desc_ops); +} + +struct collect_image_info io_uring_cinfo = { + .fd_type = CR_FD_IO_URING_FILE, + .pb_type = PB_IO_URING_FILE, + .priv_size = sizeof(struct io_uring_file_info), + .collect = collect_one_io_uring, +}; + +static int collect_one_io_uring_data(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct io_uring_data_info *iodi = o; + struct io_uring_ctx *ctx; + size_t bytes; + + iodi->iode = pb_msg(base, IoUringDataEntry); + + ctx = lookup_ctx(iodi->iode->id); + if (!ctx) { + /* Should have been inserted by file collect stage */ + pr_err("Failed to failed io_uring restore ctx for id %#08lx\n", (unsigned long)iodi->iode->id); + return -ENOENT; + } + + bytes = iodi->iode->sqe_bytes + iodi->iode->cqe_bytes + iodi->iode->sq_arr_bytes; + ctx->restore.data = xmalloc(bytes); + if (!ctx->restore.data) + return -ENOMEM; + + return read_img_buf(i, ctx->restore.data, bytes); +} + +struct collect_image_info io_uring_data_cinfo = { + .fd_type = CR_FD_IO_URING_DATA, + .pb_type = PB_IO_URING_DATA, + .priv_size = sizeof(struct io_uring_data_info), + .collect = collect_one_io_uring_data, +}; + +static int open_io_uring_map(int 
pid, struct vma_area *vma) +{ + struct fdinfo_list_entry *fle; + VmaEntry *vme = vma->e; + struct file_desc *fd; + + fd = find_file_desc_raw(FD_TYPES__IO_URING, vme->shmid); + if (!fd) + return -1; + + list_for_each_entry(fle, &fd->fd_info_head, desc_list) { + if (fle->pid == pid) { + int fd; + + fd = dup(fle->fe->fd); + if (fd < 0) + return -errno; + + vme->fd = fd; + return 0; + } + } + + return -ENOENT; +} + +int collect_io_uring_map(struct vma_area *vma) +{ + vma->vm_open = open_io_uring_map; + return 0; +} + +int dump_io_uring_map(struct vma_area *vma) +{ + struct io_uring_ctx *ctx; + + ctx = lookup_ctx(vma->io_uring_id); + if (!ctx) + return -ENOENT; + + if (!(ctx->state & CTX_F_DONE_ALL)) { + pr_err("Mapping(s) found for io_uring but no fd open, cannot dump " + "io_uring instance without access to io_uring fd corresponding " + "to the mapping\n"); + return -ENOTSUP; + } + + vma->e->shmid = ctx->inode; + return 0; +} + +int add_one_io_uring_mapping(uint64_t offset, ino_t inode) +{ + struct io_uring_ctx *ctx; + uint64_t flag; + + pr_debug("Processing for io_uring mapping at offset=%s\n", offset_to_str(offset)); + flag = offset_to_state(offset); + if (!flag) { + pr_err("Invalid offset of mapping offset=%" PRIu64 "\n", offset); + return -EINVAL; + } + + ctx = lookup_ctx(inode); + if (!ctx) { + pr_debug("No io_uring ctx associated with inode=%lu, creating one...\n", (unsigned long)inode); + + ctx = alloc_ctx(); + if (!ctx) + return -ENOMEM; + + ctx->inode = inode; + insert_ctx(ctx->inode, ctx); + } + + ctx->state |= flag; + return 0; +} diff --git a/criu/irmap.c b/criu/irmap.c index 09570c5931..7b9d77bc1f 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -160,8 +160,8 @@ static int irmap_update_dir(struct irmap *t) k = &t->kids[nr - 1]; - k->kids = NULL; /* for xrealloc above */ - k->ino = 0; /* for irmap_update_stat */ + k->kids = NULL; /* for xrealloc above */ + k->ino = 0; /* for irmap_update_stat */ k->nr_kids = -1; /* for irmap_update_dir */ k->path = 
xsprintf("%s/%s", t->path, de->d_name); if (!k->path) diff --git a/criu/kerndat.c b/criu/kerndat.c index 0e88ba43e2..9f6a6ec428 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -816,6 +816,35 @@ static int kerndat_x86_has_ptrace_fpu_xsave_bug(void) return 0; } +int kerndat_sockopt_buf_lock(void) +{ + int exit_code = -1; + socklen_t len; + u32 buf_lock; + int sock; + + sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0) { + pr_perror("Unable to create a socket"); + return -1; + } + + len = sizeof(buf_lock); + if (getsockopt(sock, SOL_SOCKET, SO_BUF_LOCK, &buf_lock, &len)) { + if (errno != ENOPROTOOPT) { + pr_perror("Unable to get SO_BUF_LOCK with getsockopt"); + goto err; + } + kdat.has_sockopt_buf_lock = false; + } else + kdat.has_sockopt_buf_lock = true; + + exit_code = 0; +err: + close(sock); + return exit_code; +} + #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" #define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" @@ -1359,6 +1388,10 @@ int kerndat_init(void) pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_sockopt_buf_lock()) { + pr_err("kerndat_sockopt_buf_lock failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/mem.c b/criu/mem.c index ca74bfbb65..7a1f355521 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -31,6 +31,9 @@ #include "prctl.h" #include "compel/infect-util.h" #include "pidfd-store.h" +#include "compel/plugins/std/syscall-codes.h" +#include "common/scm.h" +#include "io_uring.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" @@ -125,6 +128,8 @@ bool should_dump_page(VmaEntry *vmae, u64 pme) return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; + if (vma_entry_is(vmae, VMA_AREA_IO_URING)) + return false; if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) return true; @@ -704,6 +709,8 @@ int prepare_mm_pid(struct pstree_item *i) ret = collect_filemap(vma); else if 
(vma_area_is(vma, VMA_AREA_SOCKET)) ret = collect_socket_map(vma); + else if (vma_area_is(vma, VMA_AREA_IO_URING)) + ret = collect_io_uring_map(vma); else ret = 0; if (ret) diff --git a/criu/mount.c b/criu/mount.c index ec31f02c23..4b57ac7034 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -27,6 +27,7 @@ #include "external.h" #include "clone-noasan.h" #include "fdstore.h" +#include "rst-malloc.h" #include "images/mnt.pb-c.h" @@ -1017,39 +1018,21 @@ int mnt_is_dir(struct mount_info *pm) return 0; } -/* - * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case. - * If mnt_fd is -1, the mountpoint will be opened by this function. - */ -int __open_mountpoint(struct mount_info *pm, int mnt_fd) +int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo) { struct stat st; - int dev; + unsigned int dev; int ret; - if (mnt_fd == -1) { - int mntns_root; - - mntns_root = mntns_get_root_fd(pm->nsid); - if (mntns_root < 0) - return -1; - - mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY); - if (mnt_fd < 0) { - pr_perror("Can't open %s", pm->ns_mountpoint); - return -1; - } - } - ret = fstat(mnt_fd, &st); if (ret < 0) { pr_perror("fstat(%s) failed", pm->ns_mountpoint); - goto err; + return -1; } if (pm->s_dev_rt == MOUNT_INVALID_DEV) { pr_err("Resolving over invalid device for %#x %s %s\n", pm->s_dev, pm->fstype->name, pm->ns_mountpoint); - goto err; + return -1; } dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); @@ -1060,15 +1043,51 @@ int __open_mountpoint(struct mount_info *pm, int mnt_fd) * allocates new device ID). */ if (dev != pm->s_dev_rt) { + /* + * For btrfs device numbers in stat and mountinfo can be + * different, fallback to get_sdev_from_fd to get right dev. 
+ */ + if (!strcmp(pm->fstype->name, "btrfs") && !get_sdev_from_fd(mnt_fd, &dev, parse_mountinfo) && + dev == pm->s_dev_rt) + return 0; + pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, pm->fstype->name, pm->ns_mountpoint); - goto err; + return -1; + } + + return 0; +} + +int check_mountpoint_fd(struct mount_info *pm, int mnt_fd) +{ + return __check_mountpoint_fd(pm, mnt_fd, false); +} + +/* + * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case. + * If mnt_fd is -1, the mountpoint will be opened by this function. + */ +int __open_mountpoint(struct mount_info *pm) +{ + int mntns_root, mnt_fd; + + mntns_root = mntns_get_root_fd(pm->nsid); + if (mntns_root < 0) + return -1; + + mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY); + if (mnt_fd < 0) { + pr_perror("Can't open %s", pm->ns_mountpoint); + return -1; + } + + if (check_mountpoint_fd(pm, mnt_fd)) { + close(mnt_fd); + return -1; } return mnt_fd; -err: - close(mnt_fd); - return -1; } int open_mount(unsigned int s_dev) @@ -1079,7 +1098,7 @@ int open_mount(unsigned int s_dev) if (!m) return -ENOENT; - return __open_mountpoint(m, -1); + return __open_mountpoint(m); } /* Bind-mount a mount point in a temporary place without children */ @@ -1109,12 +1128,34 @@ static int get_clean_fd(struct mount_info *mi) char *mnt_path = NULL; char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX"; char mnt_path_root[] = "/cr-tmpfs.XXXXXX"; + int fd; mnt_path = get_clean_mnt(mi, mnt_path_tmp, mnt_path_root); if (!mnt_path) return -1; - return open_detach_mount(mnt_path); + fd = open(mnt_path, O_RDONLY | O_DIRECTORY, 0); + if (fd < 0) { + pr_perror("Can't open directory %s", mnt_path); + } else { + if (__check_mountpoint_fd(mi, fd, true)) + goto err_close; + } + + if (umount2(mnt_path, MNT_DETACH)) { + pr_perror("Can't detach mount %s", mnt_path); + goto err_close; + } + + if (rmdir(mnt_path)) { + pr_perror("Can't remove tmp dir %s", mnt_path); + goto err_close; 
+ } + + return fd; +err_close: + close_safe(&fd); + return -1; } /* @@ -1332,6 +1373,11 @@ int ns_open_mountpoint(void *arg) goto err; } + if (__check_mountpoint_fd(mi, *fd, true)) { + close(*fd); + goto err; + } + return 0; err: return 1; @@ -1343,7 +1389,7 @@ int open_mountpoint(struct mount_info *pm) /* No overmounts and children - the entire mount is visible */ if (list_empty(&pm->children) && !mnt_is_overmounted(pm)) - return __open_mountpoint(pm, -1); + return __open_mountpoint(pm); pr_info("Mount is not fully visible %s\n", pm->mountpoint); @@ -1406,7 +1452,7 @@ int open_mountpoint(struct mount_info *pm) goto err; } - return __open_mountpoint(pm, fd); + return fd < 0 ? __open_mountpoint(pm) : fd; err: if (ns_old >= 0) /* coverity[check_return] */ @@ -1415,7 +1461,8 @@ int open_mountpoint(struct mount_info *pm) return -1; } -static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev) +static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev, + bool rst) { struct mount_info *mi, *t, *parent; bool add_slash = false; @@ -1434,7 +1481,7 @@ static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsnam } } - mi = mnt_entry_alloc(); + mi = mnt_entry_alloc(rst); if (!mi) return -1; @@ -2561,7 +2608,7 @@ static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { - struct mount_info *mi; /* child is remaped into the root yards */ + struct mount_info *mi; /* child is remaped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; @@ -2723,7 +2770,7 @@ static int cr_pivot_root(char *root) return exit_code; } -struct mount_info *mnt_entry_alloc() +struct mount_info *mnt_entry_alloc(bool rst) { struct mount_info *new; @@ -2734,6 +2781,13 @@ struct mount_info *mnt_entry_alloc() new = xzalloc(sizeof(struct mount_info)); if (new) { + if (rst) { + 
new->remounted_rw = shmalloc(sizeof(int)); + if (!new->remounted_rw) { + xfree(new); + return NULL; + } + } new->fd = -1; new->is_overmounted = -1; INIT_LIST_HEAD(&new->children); @@ -2956,7 +3010,7 @@ static int collect_mnt_from_image(struct mount_info **head, struct mount_info ** if (ret <= 0) break; - pm = mnt_entry_alloc(); + pm = mnt_entry_alloc(true); if (!pm) goto err; @@ -3234,7 +3288,7 @@ static int populate_mnt_ns(void) { int ret; - root_yard_mp = mnt_entry_alloc(); + root_yard_mp = mnt_entry_alloc(true); if (!root_yard_mp) return -1; @@ -3247,7 +3301,7 @@ static int populate_mnt_ns(void) #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { /* Add to mount tree. Generic code will mount it later */ - ret = add_cr_time_mount(root_yard_mp, "binfmt_misc", BINFMT_MISC_HOME, 0); + ret = add_cr_time_mount(root_yard_mp, "binfmt_misc", BINFMT_MISC_HOME, 0, true); if (ret) return -1; } @@ -3697,7 +3751,7 @@ int collect_mnt_namespaces(bool for_dump) ret = -1; goto err; } else if (ret > 0 && add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, - s_dev) < 0) { + s_dev, false) < 0) { ret = -1; goto err; } @@ -3838,7 +3892,10 @@ int try_remount_writable(struct mount_info *mi, bool ns) if (!ns) remounted = REMOUNTED_RW_SERVICE; - if (mi->flags & MS_RDONLY && !(mi->remounted_rw & remounted)) { + /* All mounts in mntinfo list should have it on restore */ + BUG_ON(mi->remounted_rw == NULL); + + if (mi->flags & MS_RDONLY && !(*mi->remounted_rw & remounted)) { if (mnt_is_overmounted(mi)) { pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id); return -1; @@ -3861,7 +3918,7 @@ int try_remount_writable(struct mount_info *mi, bool ns) if (call_helper_process(ns_remount_writable, mi)) return -1; } - mi->remounted_rw |= remounted; + *mi->remounted_rw |= remounted; } return 0; @@ -3876,7 +3933,7 @@ static int __remount_readonly_mounts(struct ns_id *ns) if (ns && mi->nsid != ns) continue; - if 
(!(mi->remounted_rw && REMOUNTED_RW)) + if (!(*mi->remounted_rw && REMOUNTED_RW)) continue; /* diff --git a/criu/namespaces.c b/criu/namespaces.c index 7fa58682b8..c36e631cfd 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1330,11 +1330,6 @@ static int usernsd(int sk) unsc_msg_pid_fd(&um, &pid, &fd); pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags); - if (fd < 0 && flags & UNS_FDOUT) { - pr_err("uns: bad flags/fd %p %d %x\n", call, fd, flags); - BUG(); - } - /* * Caller has sent us bare address of the routine it * wants to call. Since the caller is fork()-ed from the diff --git a/criu/net.c b/criu/net.c index 7b45f0633f..50655559d7 100644 --- a/criu/net.c +++ b/criu/net.c @@ -1172,7 +1172,7 @@ struct newlink_req { * request. */ struct newlink_extras { - int link; /* IFLA_LINK */ + int link; /* IFLA_LINK */ int target_netns; /* IFLA_NET_NS_FD */ }; @@ -1744,7 +1744,7 @@ static int __restore_link(struct ns_id *ns, struct net_link *link, int nlsk) switch (nde->type) { case ND_TYPE__LOOPBACK: /* fallthrough */ - case ND_TYPE__EXTLINK: /* see comment in images/netdev.proto */ + case ND_TYPE__EXTLINK: /* see comment in images/netdev.proto */ return restore_link_parms(link, nlsk); case ND_TYPE__VENET: return restore_one_link(ns, link, nlsk, venet_link_info, NULL); @@ -2250,12 +2250,12 @@ static int restore_ip_dump(int type, int pid, char *cmd) sockfd = img_raw_fd(img); if (sockfd < 0) { pr_err("Getting raw FD failed\n"); - return -1; + goto out_image; } tmp_file = tmpfile(); if (!tmp_file) { pr_perror("Failed to open tmpfile"); - return -1; + goto out_image; } while ((n = read(sockfd, buf, 1024)) > 0) { @@ -2264,25 +2264,34 @@ static int restore_ip_dump(int type, int pid, char *cmd) pr_perror("Failed to write to tmpfile " "[written: %d; total: %d]", written, n); - goto close; + goto out_tmp_file; } } if (fseek(tmp_file, 0, SEEK_SET)) { pr_perror("Failed to set file position to beginning of tmpfile"); - goto close; + goto out_tmp_file; } - 
if (img) { - ret = run_ip_tool(cmd, "restore", NULL, NULL, fileno(tmp_file), -1, 0); - close_image(img); + if (type == CR_FD_RULE) { + /* + * Delete 3 default rules to prevent duplicates. See kernel's + * function fib_default_rules_init() for the details. + */ + run_ip_tool("rule", "flush", NULL, NULL, -1, -1, 0); + run_ip_tool("rule", "delete", "table", "local", -1, -1, 0); } -close: + ret = run_ip_tool(cmd, "restore", NULL, NULL, fileno(tmp_file), -1, 0); + +out_tmp_file: if (fclose(tmp_file)) { pr_perror("Failed to close tmpfile"); } +out_image: + close_image(img); + return ret; } @@ -2304,31 +2313,7 @@ static inline int restore_route(int pid) static inline int restore_rule(int pid) { - struct cr_img *img; - int ret = 0; - - img = open_image(CR_FD_RULE, O_RSTR, pid); - if (!img) { - ret = -1; - goto out; - } - - if (empty_image(img)) - goto close; - - /* - * Delete 3 default rules to prevent duplicates. See kernel's - * function fib_default_rules_init() for the details. - */ - run_ip_tool("rule", "flush", NULL, NULL, -1, -1, 0); - run_ip_tool("rule", "delete", "table", "local", -1, -1, 0); - - if (restore_ip_dump(CR_FD_RULE, pid, "rule")) - ret = -1; -close: - close_image(img); -out: - return ret; + return restore_ip_dump(CR_FD_RULE, pid, "rule"); } /* diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 9adf2c8b22..60c793009f 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -50,8 +50,8 @@ static void psi2iovec(struct page_server_iov *ps, struct iovec *iov) #define PS_IOV_ADD_F 6 #define PS_IOV_GET 7 -#define PS_IOV_FLUSH 0x1023 -#define PS_IOV_FLUSH_N_CLOSE 0x1024 +#define PS_IOV_CLOSE 0x1023 +#define PS_IOV_FORCE_CLOSE 0x1024 #define PS_CMD_BITS 16 #define PS_CMD_MASK ((1 << PS_CMD_BITS) - 1) @@ -1223,8 +1223,8 @@ static int page_server_serve(int sk) ret = page_server_add(sk, &pi, flags); break; } - case PS_IOV_FLUSH: - case PS_IOV_FLUSH_N_CLOSE: { + case PS_IOV_CLOSE: + case PS_IOV_FORCE_CLOSE: { int32_t status = 0; ret = 0; @@ -1250,7 +1250,9 @@ 
static int page_server_serve(int sk) break; } - if (ret || (pi.cmd == PS_IOV_FLUSH_N_CLOSE)) + if (ret) + break; + if (pi.cmd == PS_IOV_CLOSE || pi.cmd == PS_IOV_FORCE_CLOSE) break; } @@ -1259,6 +1261,8 @@ static int page_server_serve(int sk) ret = -1; } + tls_terminate_session(ret != 0); + if (ret == 0 && opts.ps_socket == -1) { char c; @@ -1272,7 +1276,6 @@ static int page_server_serve(int sk) } } - tls_terminate_session(); page_server_close(); pr_info("Session over\n"); @@ -1490,9 +1493,9 @@ int disconnect_from_page_server(void) * the parent process) so we must order the * page-server to terminate itself. */ - pi.cmd = PS_IOV_FLUSH_N_CLOSE; + pi.cmd = PS_IOV_FORCE_CLOSE; else - pi.cmd = PS_IOV_FLUSH; + pi.cmd = PS_IOV_CLOSE; if (send_psi(page_server_sk, &pi)) goto out; @@ -1504,7 +1507,7 @@ int disconnect_from_page_server(void) ret = 0; out: - tls_terminate_session(); + tls_terminate_session(ret != 0); close_safe(&page_server_sk); return ret ?: status; diff --git a/criu/pagemap.c b/criu/pagemap.c index 77e519dd1f..83f69bba37 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -30,10 +30,10 @@ * One "job" for the preadv() syscall in pagemap.c */ struct page_read_iov { - off_t from; /* offset in pi file where to start reading from */ - off_t end; /* the end of the read == sum to.iov_len -s */ + off_t from; /* offset in pi file where to start reading from */ + off_t end; /* the end of the read == sum to.iov_len -s */ struct iovec *to; /* destination iovs */ - unsigned int nr; /* their number */ + unsigned int nr; /* their number */ struct list_head l; }; @@ -535,7 +535,6 @@ static int process_async_reads(struct page_read *pr) fd = img_raw_fd(pr->pi); list_for_each_entry_safe(piov, n, &pr->async, l) { ssize_t ret; - off_t start = piov->from; struct iovec *iovs = piov->to; pr_debug("Read piov iovs %d, from %ju, len %ju, first %p:%zu\n", piov->nr, piov->from, @@ -554,13 +553,16 @@ static int process_async_reads(struct page_read *pr) } } - if (ret != piov->end - 
piov->from) { - if (ret < 0) { - pr_err("Can't read async pr bytes (%zd / %ju read, %ju off, %d iovs)\n", ret, - piov->end - piov->from, piov->from, piov->nr); - return -1; - } + if (ret < 0) { + pr_err("Can't read async pr bytes (%zd / %ju read, %ju off, %d iovs)\n", ret, + piov->end - piov->from, piov->from, piov->nr); + return -1; + } + if (opts.auto_dedup && punch_hole(pr, piov->from, ret, false)) + return -1; + + if (ret != piov->end - piov->from) { /* * The preadv() can return less than requested. It's * valid and doesn't mean error or EOF. We should advance @@ -574,9 +576,6 @@ static int process_async_reads(struct page_read *pr) goto more; } - if (opts.auto_dedup && punch_hole(pr, start, ret, false)) - return -1; - BUG_ON(pr->io_complete); /* FIXME -- implement once needed */ list_del(&piov->l); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 4304691bbe..0051452e47 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1013,7 +1013,7 @@ static int timerfd_arm(struct task_restore_args *args) t->val.it_value.tv_sec += (time_t)ts.tv_sec; - pr_debug("Adjust id %#x it_value(%llu, %llu) -> it_value(%llu, %llu)\n", t->id, + pr_debug("Adjust id %x it_value(%llu, %llu) -> it_value(%llu, %llu)\n", t->id, (unsigned long long)ts.tv_sec, (unsigned long long)ts.tv_nsec, (unsigned long long)t->val.it_value.tv_sec, (unsigned long long)t->val.it_value.tv_nsec); diff --git a/criu/plugin.c b/criu/plugin.c index 3fe03c7cd7..f3fea28566 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -54,6 +54,9 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(DUMP_EXT_MOUNT, "cr_plugin_dump_ext_mount"); __assign_hook(RESTORE_EXT_MOUNT, "cr_plugin_restore_ext_mount"); __assign_hook(DUMP_EXT_LINK, "cr_plugin_dump_ext_link"); + __assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma"); + __assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map"); + __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); #undef __assign_hook 
diff --git a/criu/proc_parse.c b/criu/proc_parse.c index f3491e7817..5c593f426e 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "types.h" @@ -41,10 +42,12 @@ #include "path.h" #include "fault-injection.h" #include "memfd.h" +#include "io_uring.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" +#include "plugin.h" #include @@ -75,7 +78,8 @@ static char *buf = __buf.buf; * This is how AIO ring buffers look like in proc */ -#define AIO_FNAME "/[aio]" +#define AIO_FNAME "/[aio]" +#define IO_URING_FNAME "anon_inode:[io_uring]" /* check the @line starts with "%lx-%lx" format */ static bool __is_vma_range_fmt(char *line) @@ -103,6 +107,19 @@ bool is_vma_range_fmt(char *line) return __is_vma_range_fmt(line); } +bool handle_vma_plugin(int *fd, struct stat *stat) +{ + int ret; + + ret = run_plugins(HANDLE_DEVICE_VMA, *fd, stat); + if (ret < 0) { + pr_perror("handle_device_vma plugin failed"); + return false; + } + + return true; +} + static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { char *tok; @@ -171,7 +188,8 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) + if (io_pf && !vma_area_is(vma_area, VMA_AREA_IO_URING) && !vma_area_is(vma_area, VMA_AREA_VVAR) && + !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) @@ -188,6 +206,7 @@ struct vma_file_info { int dev_min; unsigned long ino; struct vma_area *vma; + bool has_device_plugin; }; static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b) @@ -373,14 +392,20 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, st /* * If vfi is equal (!) 
and negative @vm_file_fd -- - * we have nothing to borrow for sure. + * we have nothing to borrow for sure, unless it's io_uring */ - if (*vm_file_fd < 0) + if (*vm_file_fd < 0 && !vma_area_is(prev, VMA_AREA_IO_URING)) return 0; pr_debug("vma %" PRIx64 " borrows vfi from previous %" PRIx64 "\n", vma->e->start, prev->e->start); - if (prev->e->status & VMA_AREA_SOCKET) + if (prev->e->status & VMA_AREA_SOCKET) { vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; + } else if (prev->e->status & VMA_AREA_IO_URING) { + vma->e->status |= VMA_AREA_IO_URING | VMA_AREA_REGULAR; + vma->io_uring_id = prev->io_uring_id; + /* Add page to io_uring ctx */ + add_one_io_uring_mapping(vma->e->pgoff, vma->io_uring_id); + } /* * FIXME -- in theory there can be vmas that have @@ -437,6 +462,16 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, st return 0; } + if (!strncmp(fname, IO_URING_FNAME, sizeof(IO_URING_FNAME) - 1)) { + pr_debug("Marking VMA as IO_URING | REGULAR for inode %lu\n", + (unsigned long)buf.st_ino); + vma->io_uring_id = buf.st_ino; + vma->e->status |= VMA_AREA_IO_URING | VMA_AREA_REGULAR; + /* Add page to io_uring ctx */ + add_one_io_uring_mapping(vma->e->pgoff, vma->io_uring_id); + return 0; + } + pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); return -1; } @@ -577,11 +612,17 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat } else if (*vm_file_fd >= 0) { struct stat *st_buf = vma_area->vmst; - if (S_ISREG(st_buf->st_mode)) + if (S_ISREG(st_buf->st_mode)) { /* regular file mapping -- supported */; - else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) + pr_debug("Found regular file mapping, OK\n"); + } else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) { /* devzero mapping -- also makes sense */; - else { + pr_debug("Found devzero mapping, OK\n"); + } else if (handle_vma_plugin(vm_file_fd, st_buf)) { + pr_info("Found device file mapping, plugin is available\n"); + 
vfi->has_device_plugin = true; + } else { + /* non-regular mapping with no supporting plugin */ pr_err("Can't handle non-regular mapping on %d's map %" PRIx64 "\n", pid, vma_area->e->start); goto err; } @@ -616,6 +657,11 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat */ if (vma_area->mnt_id != -1 && get_fd_mntid(*vm_file_fd, &vma_area->mnt_id)) return -1; + } else if (vma_area->e->status & VMA_AREA_IO_URING) { + if (vma_area->e->flags & MAP_PRIVATE) + vma_area->e->status |= VMA_FILE_PRIVATE; + else + vma_area->e->status |= VMA_FILE_SHARED; } else { /* * No file but mapping -- anonymous one. @@ -646,9 +692,23 @@ static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area struct vma_file_info *vfi, struct vma_file_info *prev_vfi) { if (vma_area->e->status & VMA_UNSUPP) { - pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, - vma_area->e->end); - return -1; + if (vfi->has_device_plugin) { + /* Unsupported VMAs that provide special plugins for + * backup can be treated as regular VMAs and criu + * should only save their metadata in the dump files. + * There can be several special backup plugins hooks + * that might run at different stages during checkpoint + * and restore. 
+ */ + pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " " + "must be supported via device plugins\n", + vma_area->e->start, vma_area->e->end); + + } else { + pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, + vma_area->e->end); + return -1; + } } /* Add a guard page only if here is enough space for it */ @@ -1492,6 +1552,59 @@ int parse_timens_offsets(struct timespec *boff, struct timespec *moff) return exit_code; } +static int get_mountinfo_sdev_from_mntid(int mnt_id, unsigned int *sdev) +{ + int exit_code = -1; + FILE *f; + + f = fopen_proc(PROC_SELF, "mountinfo"); + if (!f) + return -1; + + while (fgets(buf, BUF_SIZE, f)) { + unsigned int kmaj, kmin; + int id; + + if (sscanf(buf, "%i %*i %u:%u", &id, &kmaj, &kmin) != 3) { + pr_err("Failed to parse mountinfo line %s\n", buf); + goto err; + } + + if (id == mnt_id) { + *sdev = MKKDEV(kmaj, kmin); + exit_code = 0; + break; + } + } +err: + fclose(f); + return exit_code; +} + +/* This works even on btrfs where stat does not show right sdev */ +int get_sdev_from_fd(int fd, unsigned int *sdev, bool parse_mountinfo) +{ + struct mount_info *mi; + int ret, mnt_id; + + ret = get_fd_mntid(fd, &mnt_id); + if (ret < 0) + return -1; + + /* Simple case mnt_id is in dumped mntns */ + mi = lookup_mnt_id(mnt_id); + if (mi) { + *sdev = mi->s_dev_rt; + return 0; + } + + if (!parse_mountinfo) + return -1; + + /* Complex case mnt_id is in mntns created by criu */ + return get_mountinfo_sdev_from_mntid(mnt_id, sdev); +} + struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) { struct mount_info *list = NULL; @@ -1506,7 +1619,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) int ret = -1; char *fsname = NULL; - new = mnt_entry_alloc(); + new = mnt_entry_alloc(false); if (!new) goto end; @@ -1710,7 +1823,263 @@ static int parse_bpfmap(struct bfd *f, char *str, BpfmapFileEntry *bpf) #define fdinfo_field(str, field) !strncmp(str, 
field ":", sizeof(field)) +static int parse_io_uring(struct bfd *f, char *str, struct io_uring_ctx *ctx) +{ + IoUringFileEntry *iofe = io_uring_get_iofe(ctx); + unsigned int nr; + pid_t pid; + int r; + + /* + * Format is: + * + * SqThread: %d + * SqThreadCpu: %d + * UserFiles: %u (number of registered files) (OPTIONAL DATA) + * %5u: %s (idx: filename) + * UserBufs: %u (number of registered buffers) (OPTIONAL DATA) + * %5u: 0x%llx/%u (idx: 0xaddr/len) + * Personalities: (OPTIONAL HEADING and DATA) + * %5d (id) + * Uid: %llu %llu %llu %llu (uid euid suid fsuid) + * Gid: %llu %llu %llu %llu (gid egid sgid fsgid) + * Groups: %llu %llu ... %llu (groups) + * CapEff: %llx ... %llx + * PollList: (OPTIONAL DATA) + * op=%d, task_works=%d (op=opcode, task_works=0 or 1) + * --- (Added by patch) + * Locked: %d (0 or 1) + * SqThreadIdle: %u + * SetupFlags: 0x%x + * SqEntries: %u + * CqEntries: %u + * SqOffArray: %u + * ... (OPTIONAL FIELDS) + * RestrictRegisterOp: %s (bitmap) + * RestrictSqeOp: %s (bitmap) + * RestrictSqeFlagsAllowed: %c (u8) + * RestrictSqeFlagsRequired: %c (u8) + */ + + if (sscanf(str, "SqThread: %d", &pid) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SqThreadCpu: %d", &iofe->sq_thread_cpu) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "UserFiles: %u", &nr) != 1) + goto end; + if (nr) { + /* Not supported, yet */ + pr_warn("Registered files dump unsupported\n"); + return -ENOTSUP; + do { + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (!strncmp(str, "UserBufs", sizeof("UserBufs") - 1)) + break; + /* skip line, we use eBPF iterator to collect the file + * set registered with io_uring */ + } while (true); + } + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "UserBufs: %u", &nr) != 1) + goto end; + for (int i = 0; i < nr; i++) { + long long unsigned int address; + unsigned int idx, len; + + str = 
breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "%5u: 0x%llx/%u", &idx, &address, &len) != 3) + goto end; + + if (io_uring_push_buf(ctx, idx, address, len)) + goto end; + } + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (!strncmp(str, "Personalities", sizeof("Personalities") - 1)) { + for (;;) { + struct io_uring_personality_desc desc = {}; + struct io_uring_group_desc *g, *gtmp; + char *tok; + int id; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + str = str + strspn(str, " "); + if (!strncmp(str, "PollList", sizeof("PollList") - 1)) + break; + else if (sscanf(str, "%5d", &id) != 1) + goto end; + desc.id = id; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + str = str + strspn(str, " "); + if (sscanf(str, " Uid: %u %u %u %u", &desc.uid, &desc.euid, &desc.suid, &desc.fsuid) != 4) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, " Gid: %u %u %u %u", &desc.gid, &desc.egid, &desc.sgid, &desc.fsgid) != 4) + goto end; + + INIT_LIST_HEAD(&desc.group_list); + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + str = strstr(str, ":"); + tok = str + 2; + while ((tok = strtok(tok, " "))) { + struct io_uring_group_desc *gdesc; + + gdesc = xzalloc(sizeof(*gdesc)); + if (!gdesc) + goto end_free; + INIT_LIST_HEAD(&gdesc->list); + + if (sscanf(tok, "%u", &gdesc->group) != 1) + goto end_free; + list_add_tail(&gdesc->list, &desc.group_list); + desc.nr_groups++; + tok = NULL; + } + + /* CapEff */ + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end_free; + str = strstr(str, ":"); + str += 2; + if (cap_parse(str, desc.cap_eff)) + goto end_free; + + if (io_uring_push_personality(ctx, &desc)) + goto end_free; + continue; + end_free: + list_for_each_entry_safe(g, gtmp, &desc.group_list, list) + xfree(g); + goto end; + } + } + + /* PollList: */ + for (; str; str = breadline(f)) { + if (IS_ERR(str)) + goto end; + /* Skip leading space */ + str = str + 
strspn(str, " "); + if (!strncmp(str, "op", sizeof("op") - 1) || !strncmp(str, "PollList", sizeof("PollList") - 1)) /* step over the "PollList" heading as well as its op= entries */ + continue; + else + break; + } + if (IS_ERR_OR_NULL(str)) + goto end; + + /* str obtained from above: first line following the PollList section */ + if (sscanf(str, "Locked: %d", &r) != 1) + goto end; + if (!r) { + pr_err("fdinfo read for io_uring could not take ctx->uring_lock inside kernel\n" + "This indicates that the ring is not idle, hence cannot proceed\n"); + goto end; + } + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SqThreadIdle: %u", &iofe->sq_thread_idle) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SetupFlags: 0x%x", &iofe->setup_flags) != 1) /* value is hex with a 0x prefix; %u would stop at the 'x' and store 0 */ + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SqEntries: %u", &iofe->sq_entries) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "CqEntries: %u", &iofe->cq_entries) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SqOffArray: %u", &iofe->sq_off_array) != 1) + goto end; + + /* Printing restrictions is optional */ + str = breadline(f); + if (IS_ERR(str)) + goto end; + if (!str) + return 0; + nr = 0; + /* Upper bits are unused in bitmap */ + if (sscanf(str, "RestrictRegisterOp: %x,%x", &nr, &iofe->reg_op) != 2) { + /* 32-bit long?
*/ + if (sscanf(str, "RestrictRegisterOp: %x", &iofe->reg_op) != 1) + goto end; + } + BUG_ON(nr); + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "RestrictSqeOp: %x,%x", &nr, &iofe->sqe_op) != 2) { + if (sscanf(str, "RestrictSqeOp: %x", &iofe->sqe_op) != 1) + goto end; + } + BUG_ON(nr); + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "RestrictSqeFlagsAllowed: 0x%x", &iofe->sqe_flags_allowed) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "RestrictSqeFlagsRequired: 0x%x", &iofe->sqe_flags_required) != 1) + goto end; + iofe->restrictions = true; + + return 0; +end: + pr_err("Incomplete io_uring fdinfo support\n"); + return -1; +} + static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked); + static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) { struct bfd f; @@ -2030,6 +2399,21 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) entry_met = true; continue; } + if (fdinfo_field(str, "ino")) { + if (type != FD_TYPES__IO_URING) + goto parse_err; + + str = breadline(&f); + if (IS_ERR_OR_NULL(str)) + goto parse_err; + + ret = parse_io_uring(&f, str, arg); + if (ret) + goto parse_err; + + entry_met = true; + continue; + } } exit_code = 0; diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5be..9c267de20b 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/io_uring.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; diff --git a/criu/seize.c b/criu/seize.c index 95bf9ef0c1..58564ca746 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -131,11 +131,11 @@ static enum freezer_state get_freezer_state(int fd) return get_freezer_v1_state(fd); } -static bool freezer_thawed; +static enum freezer_state origin_freezer_state = FREEZER_ERROR; 
const char *get_real_freezer_state(void) { - return freezer_thawed ? thawed : frozen; + return origin_freezer_state == THAWED ? thawed : frozen; } static int freezer_write_state(int fd, enum freezer_state new_state) @@ -192,7 +192,7 @@ static int freezer_restore_state(void) int fd; int ret; - if (!opts.freeze_cgroup || freezer_thawed) + if (!opts.freeze_cgroup || origin_freezer_state != FROZEN) return 0; fd = freezer_open(); @@ -481,9 +481,10 @@ static int freeze_processes(void) close(fd); return -1; } - if (state == THAWED) { - freezer_thawed = true; + origin_freezer_state = state == FREEZING ? FROZEN : state; + + if (state == THAWED) { if (freezer_write_state(fd, FROZEN)) { close(fd); return -1; @@ -534,7 +535,7 @@ static int freeze_processes(void) } err: - if (exit_code == 0 || freezer_thawed) + if (exit_code == 0 || origin_freezer_state == THAWED) exit_code = freezer_write_state(fd, THAWED); if (close(fd)) { diff --git a/criu/shmem.c b/criu/shmem.c index 1b83327ef2..a9ee8d7eb7 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -81,11 +81,12 @@ struct shmem_info { * an region. Each time when we found a process with a smaller pid, * we reset self_count, so we can't have only one counter. 
*/ - int count; /* the number of regions */ + int count; /* the number of regions */ int self_count; /* the number of regions, which belongs to "pid" */ }; - struct { /* For sysvipc restore */ + /* For sysvipc restore */ + struct { struct list_head att; /* list of shmem_sysv_att-s */ int want_write; }; diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 0afecd2d60..96d5d13bf6 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -451,7 +451,7 @@ int restore_one_tcp(int fd, struct inet_sk_info *ii) pr_info("Restoring TCP connection\n"); - if (opts.tcp_close && ii->ie->state != TCP_LISTEN && ii->ie->state != TCP_CLOSE) { + if (opts.tcp_close) { if (shutdown(fd, SHUT_RDWR) && errno != ENOTCONN) { pr_perror("Unable to shutdown the socket id %x ino %x", ii->ie->id, ii->ie->ino); } diff --git a/criu/sk-unix.c b/criu/sk-unix.c index f3fe60c6eb..194193dff1 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -402,12 +402,12 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) sk_encode_shutdown(ue, sk->shutdown); /* - * If a stream listening socket has non-zero rqueue, this - * means there are in-flight connections waiting to get + * If a stream/seqpacket listening socket has non-zero rqueue, + * this means there are in-flight connections waiting to get * accept()-ed. We handle them separately with the "icons" * (i stands for in-flight, cons -- for connections) things. 
*/ - if (sk->rqlen != 0 && !(sk->type == SOCK_STREAM && sk->state == TCP_LISTEN)) { + if (sk->rqlen != 0 && sk->state != TCP_LISTEN) { if (dump_sk_queue(lfd, id)) goto err; } @@ -460,7 +460,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) pr_warn("Shutdown mismatch %u:%d -> %u:%d\n", ue->ino, ue->shutdown, peer->sd.ino, peer->shutdown); } - } else if (ue->state == TCP_ESTABLISHED) { + } else if (ue->state == TCP_ESTABLISHED && ue->type != SOCK_DGRAM) { const struct unix_sk_listen_icon *e; e = lookup_unix_listen_icons(ue->ino); @@ -958,9 +958,9 @@ struct unix_sk_info { struct unix_sk_info *peer; struct pprep_head peer_resolve; /* XXX : union with the above? */ struct file_desc d; - struct hlist_node hash; /* To lookup socket by ino */ + struct hlist_node hash; /* To lookup socket by ino */ struct list_head connected; /* List of sockets, connected to me */ - struct list_head node; /* To link in peer's connected list */ + struct list_head node; /* To link in peer's connected list */ struct list_head scm_fles; struct list_head ghost_node; size_t ghost_dir_pos; @@ -1610,7 +1610,7 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui) if (ui->ue->name.len == 0) return 0; - if ((ui->ue->type == SOCK_STREAM) && (ui->ue->state == TCP_ESTABLISHED)) { + if ((ui->ue->type != SOCK_DGRAM) && (ui->ue->state == TCP_ESTABLISHED)) { /* * FIXME this can be done, but for doing this properly we * need to bind socket to its name, then rename one to @@ -1851,14 +1851,10 @@ static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) close(sks[1]); sk = sks[0]; - } else if (ui->ue->state == TCP_ESTABLISHED && queuer && queuer->ue->ino == FAKE_INO) { + } else if ((ui->ue->state == TCP_ESTABLISHED && ui->ue->type != SOCK_DGRAM) && queuer && + queuer->ue->ino == FAKE_INO) { int ret, sks[2]; - if (ui->ue->type != SOCK_STREAM) { - pr_err("Non-stream socket %u in established state\n", ui->ue->ino); - return -1; - } - if (ui->ue->shutdown != 
SK_SHUTDOWN__BOTH) { pr_err("Wrong shutdown/peer state for %u\n", ui->ue->ino); return -1; diff --git a/criu/sockets.c b/criu/sockets.c index 9426b5b940..db772707b6 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -517,8 +517,12 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); /* setsockopt() multiplies the input values by 2 */ - ret |= userns_call(sk_setbufs, UNS_ASYNC, bufs, sizeof(bufs), sk); + ret |= userns_call(sk_setbufs, 0, bufs, sizeof(bufs), sk); + if (soe->has_so_buf_lock) { + pr_debug("\trestore buf_lock %d for socket\n", soe->so_buf_lock); + ret |= restore_opt(sk, SOL_SOCKET, SO_BUF_LOCK, &soe->so_buf_lock); + } if (soe->has_so_priority) { pr_debug("\trestore priority %d for socket\n", soe->so_priority); ret |= restore_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); @@ -619,6 +623,10 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) ret |= dump_opt(sk, SOL_SOCKET, SO_SNDBUF, &soe->so_sndbuf); ret |= dump_opt(sk, SOL_SOCKET, SO_RCVBUF, &soe->so_rcvbuf); + if (kdat.has_sockopt_buf_lock) { + soe->has_so_buf_lock = true; + ret |= dump_opt(sk, SOL_SOCKET, SO_BUF_LOCK, &soe->so_buf_lock); + } soe->has_so_priority = true; ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); soe->has_so_rcvlowat = true; diff --git a/criu/tls.c b/criu/tls.c index 60bd105bc9..4feaf613b5 100644 --- a/criu/tls.c +++ b/criu/tls.c @@ -8,7 +8,7 @@ #include "cr_options.h" #include "xmalloc.h" -/* Compatability with GnuTLS verson <3.5 */ +/* Compatibility with GnuTLS version < 3.5 */ #ifndef GNUTLS_E_CERTIFICATE_VERIFICATION_ERROR #define GNUTLS_E_CERTIFICATE_VERIFICATION_ERROR GNUTLS_E_CERTIFICATE_ERROR #endif @@ -31,7 +31,7 @@ static gnutls_certificate_credentials_t x509_cred; static int tls_sk = -1; static int tls_sk_flags = 0; -void tls_terminate_session(void) +void tls_terminate_session(bool async) { int ret; @@ -40,20 +40,26 @@ void tls_terminate_session(void) if (session) {
do { - /* don't wait for peer to close connection */ - ret = gnutls_bye(session, GNUTLS_SHUT_WR); + /* + * Initiate a connection shutdown but don't + * wait for peer to close connection. + */ + ret = gnutls_bye(session, async ? GNUTLS_SHUT_WR : GNUTLS_SHUT_RDWR); } while (ret == GNUTLS_E_AGAIN || ret == GNUTLS_E_INTERRUPTED); + /* Free the session object */ gnutls_deinit(session); } tls_sk = -1; + + /* Free the credentials object */ if (x509_cred) gnutls_certificate_free_credentials(x509_cred); } ssize_t tls_send(const void *buf, size_t len, int flags) { - int ret; + ssize_t ret; tls_sk_flags = flags; ret = gnutls_record_send(session, buf, len); @@ -95,7 +101,7 @@ int tls_send_data_from_fd(int fd, unsigned long len) return -1; while (len > 0) { - int ret, sent; + ssize_t ret, sent; copied = read(fd, buf, min(len, buf_size)); if (copied <= 0) { @@ -119,7 +125,7 @@ int tls_send_data_from_fd(int fd, unsigned long len) ssize_t tls_recv(void *buf, size_t len, int flags) { - int ret; + ssize_t ret; tls_sk_flags = flags; ret = gnutls_record_recv(session, buf, len); @@ -163,7 +169,7 @@ int tls_recv_data_to_fd(int fd, unsigned long len) gnutls_packet_t packet; while (len > 0) { - int ret, w; + ssize_t ret, w; gnutls_datum_t pdata; ret = gnutls_record_recv_packet(session, &packet); @@ -229,6 +235,7 @@ static int tls_handshake(void) { int ret = -1; while (ret != GNUTLS_E_SUCCESS) { + /* Establish TLS session */ ret = gnutls_handshake(session); if (gnutls_error_is_fatal(ret)) { tls_perror("TLS handshake failed", ret); @@ -257,6 +264,7 @@ static int tls_x509_setup_creds(void) if (opts.tls_key) key = opts.tls_key; + /* Load the trusted CA certificates */ ret = gnutls_certificate_allocate_credentials(&x509_cred); if (ret != GNUTLS_E_SUCCESS) { tls_perror("Failed to allocate x509 credentials", ret); @@ -298,10 +306,14 @@ static int tls_x509_setup_creds(void) return 0; } +/** + * A function used by gnutls to send data. 
It returns a positive + * number indicating the bytes sent, and -1 on error. + */ static ssize_t _tls_push_cb(void *p, const void *data, size_t sz) { int fd = *(int *)(p); - int ret = send(fd, data, sz, tls_sk_flags); + ssize_t ret = send(fd, data, sz, tls_sk_flags); if (ret < 0 && errno != EAGAIN) { int _errno = errno; pr_perror("Push callback send failed"); @@ -310,10 +322,15 @@ static ssize_t _tls_push_cb(void *p, const void *data, size_t sz) return ret; } +/** + * A callback function used by gnutls to receive data. + * It returns 0 on connection termination, a positive number + * indicating the number of bytes received, and -1 on error. + */ static ssize_t _tls_pull_cb(void *p, void *data, size_t sz) { int fd = *(int *)(p); - int ret = recv(fd, data, sz, tls_sk_flags); + ssize_t ret = recv(fd, data, sz, tls_sk_flags); if (ret < 0 && errno != EAGAIN) { int _errno = errno; pr_perror("Pull callback recv failed"); @@ -326,26 +343,33 @@ static int tls_x509_setup_session(unsigned int flags) { int ret; + /* Create the session object */ ret = gnutls_init(&session, flags); if (ret != GNUTLS_E_SUCCESS) { tls_perror("Failed to initialize session", ret); return -1; } + /* Install the trusted certificates */ ret = gnutls_credentials_set(session, GNUTLS_CRD_CERTIFICATE, x509_cred); if (ret != GNUTLS_E_SUCCESS) { tls_perror("Failed to set session credentials", ret); return -1; } + /* Configure the cipher preferences */ ret = gnutls_set_default_priority(session); if (ret != GNUTLS_E_SUCCESS) { tls_perror("Failed to set priority", ret); return -1; } + /* Associate the socket with the session object */ gnutls_transport_set_ptr(session, &tls_sk); + + /* Set a push function for gnutls to use to send data */ gnutls_transport_set_push_function(session, _tls_push_cb); + /* set a pull function for gnutls to use to receive data */ gnutls_transport_set_pull_function(session, _tls_pull_cb); if (flags == GNUTLS_SERVER) { @@ -375,6 +399,6 @@ int tls_x509_init(int sockfd, bool is_server) 
return 0; err: - tls_terminate_session(); + tls_terminate_session(true); return -1; } diff --git a/criu/tty.c b/criu/tty.c index 1598ad9562..1462193c5e 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -1977,6 +1977,12 @@ static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p) pr_info("Dumping tty %d with id %#x\n", lfd, id); driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev); + if (driver == NULL) { + pr_err("Unable to find a tty driver (rdev %#" PRIx64 " dev %#" PRIx64 ")\n", p->stat.st_rdev, + p->stat.st_dev); + return -1; + } + if (driver->fd_get_index) index = driver->fd_get_index(lfd, p); else diff --git a/criu/uffd.c b/criu/uffd.c index 18bdc040f0..45ac8ba774 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -71,8 +71,8 @@ static mutex_t *lazy_sock_mutex; struct lazy_iov { struct list_head l; - unsigned long start; /* run-time start address, tracks remaps */ - unsigned long end; /* run-time end address, tracks remaps */ + unsigned long start; /* run-time start address, tracks remaps */ + unsigned long end; /* run-time end address, tracks remaps */ unsigned long img_start; /* start address at the dump time */ }; @@ -1468,7 +1468,7 @@ int cr_lazy_pages(bool daemon) ret = handle_requests(epollfd, &events, nr_fds); - tls_terminate_session(); + disconnect_from_page_server(); xfree(events); return ret; diff --git a/criu/util.c b/criu/util.c index 06124c2205..4b924ae0d9 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1,4 +1,4 @@ -#define _XOPEN_SOURCE +#define _XOPEN_SOURCE 500 #include #include @@ -26,6 +26,7 @@ #include #include #include +#include #include "linux/mount.h" @@ -187,6 +188,7 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); + opt2s(VMA_AREA_IO_URING, "io_uring"); #undef opt2s } @@ -1106,7 +1108,7 @@ int setup_tcp_server(char *type, char *addr, unsigned short *port) int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk) { int 
ret; - struct sockaddr_in caddr; + struct sockaddr_storage caddr; socklen_t clen = sizeof(caddr); if (daemon_mode) { @@ -1134,13 +1136,20 @@ int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk) return -1; if (sk >= 0) { + char port[6]; + char address[INET6_ADDRSTRLEN]; *ask = accept(sk, (struct sockaddr *)&caddr, &clen); if (*ask < 0) { pr_perror("Can't accept connection to server"); goto err; - } else - pr_info("Accepted connection from %s:%u\n", inet_ntoa(caddr.sin_addr), - (int)ntohs(caddr.sin_port)); + } + ret = getnameinfo((struct sockaddr *)&caddr, clen, address, sizeof(address), port, sizeof(port), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret) { + pr_err("Failed converting address: %s\n", gai_strerror(ret)); + goto err; + } + pr_info("Accepted connection from %s:%s\n", address, port); close(sk); } @@ -1613,44 +1622,25 @@ ssize_t write_all(int fd, const void *buf, size_t size) return n; } -int rm_rf(char *target) +static int remove_one(const char *fpath, const struct stat *sb, int tflag, struct FTW *ftwbuf) { - int offset = strlen(target); - DIR *dir = NULL; - struct dirent *de; - int ret = -1; + int ret; - dir = opendir(target); - if (!dir) { - pr_perror("unable to open %s", target); + ret = remove(fpath); + if (ret) { + pr_perror("rmrf: unable to remove %s", fpath); return -1; } - while ((de = readdir(dir))) { - int n; - - if (dir_dots(de)) - continue; - - n = snprintf(target + offset, PATH_MAX - offset, "/%s", de->d_name); - if (n < 0 || n >= PATH_MAX) { - pr_err("snprintf failed\n"); - goto out; - } + return 0; +} - if (de->d_type == DT_DIR && rm_rf(target)) - goto out; +#define NFTW_FD_MAX 64 - if (remove(target) < 0) { - pr_perror("unable to remove %s", target); - goto out; - } - } - - ret = 0; -out: - target[offset] = 0; - return ret; +int rmrf(char *path) +{ + pr_debug("rmrf: removing %s\n", path); + return nftw(path, remove_one, NFTW_FD_MAX, FTW_DEPTH | FTW_PHYS); } __attribute__((returns_twice)) static pid_t raw_legacy_clone(unsigned 
long flags, int *pidfd) @@ -1687,8 +1677,8 @@ __attribute__((returns_twice)) static pid_t raw_legacy_clone(unsigned long flags */ "addx %%g0, 0, %%g1" : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */ - : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */ - : "%cc"); /* clobbers */ + : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */ + : "%cc"); /* clobbers */ is_error = g1; retval = o0; diff --git a/flog/Makefile b/flog/Makefile new file mode 100644 index 0000000000..12255af719 --- /dev/null +++ b/flog/Makefile @@ -0,0 +1,29 @@ +OPTS=-ggdb3 -Wall -Werror +export OPTS + +CFLAGS += -iquote include +CFLAGS += -iquote flog/include +CFLAGS += -iquote flog/include/uapi + +include $(__nmk_dir)msg.mk + +$(eval $(call gen-built-in,src)) + +flog: + $(Q) $(MAKE) $(build)=$(obj)/src all +.PHONY: flog + +clean-flog: + $(call msg-gen, $@) + $(Q) $(MAKE) $(build)=$(obj)/src clean + $(Q) $(RM) built-in.o +.PHONY: clean-flog + +clean: clean-flog +mrproper: clean + +test: + ./tests/test00 + +all-y += flog + diff --git a/flog/built-in.S b/flog/built-in.S new file mode 100644 index 0000000000..26627d0544 --- /dev/null +++ b/flog/built-in.S @@ -0,0 +1,4 @@ +SECTIONS +{ + .rodata : { _rodata_start = . ; *(.rodata*) ; _rodata_end = . ;} +} diff --git a/flog/include/compiler.h b/flog/include/compiler.h new file mode 100644 index 0000000000..80264ec631 --- /dev/null +++ b/flog/include/compiler.h @@ -0,0 +1,77 @@ +#ifndef __COMPILER_H__ +#define __COMPILER_H__ + +/* + * Various definitions for success build, + * picked from various places, mostly from + * the linux kernel. + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) + +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +#define __section(S) __attribute__((__section__(#S))) + +#ifndef __always_inline +#define __always_inline inline __attribute__((always_inline)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef always_inline +#define always_inline __always_inline +#endif + +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + +#define __aligned(x) __attribute__((aligned(x))) + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER) +#endif + +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) + +#define min(x, y) \ + ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void)(&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; \ + }) + +#define max(x, y) \ + ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void)(&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; \ + }) + +#define is_log2(v) (((v) & ((v)-1)) == 0) + +#endif /* __COMPILER_H__ */ diff --git a/flog/include/flog.h b/flog/include/flog.h new file mode 100644 index 0000000000..f00c20541f --- /dev/null +++ b/flog/include/flog.h @@ -0,0 +1,9 @@ +#ifndef __FLOG_H__ +#define __FLOG_H__ + +#include +#include + +#include "uapi/flog.h" + +#endif /* __FLOG_H__ */ diff --git a/flog/include/log.h b/flog/include/log.h new file mode 100644 index 0000000000..8aafe44b75 --- /dev/null +++ b/flog/include/log.h @@ -0,0 +1,17 @@ +#ifndef __LOG_H__ +#define __LOG_H__ + +#include + +#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) + +#if 1 +#define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#else +#define pr_debug(fmt, ...) +#endif + +#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_perror(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) + +#endif /* __LOG_H__ */ diff --git a/flog/include/types.h b/flog/include/types.h new file mode 100644 index 0000000000..07c992968b --- /dev/null +++ b/flog/include/types.h @@ -0,0 +1,16 @@ +#ifndef __FLOG_TYPES_H__ +#define __FLOG_TYPES_H__ + +#include +#include + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +#endif /* __FLOG_TYPES_H__ */ diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h new file mode 100644 index 0000000000..6061f4556a --- /dev/null +++ b/flog/include/uapi/flog.h @@ -0,0 +1,139 @@ +#ifndef __UAPI_FLOG_H__ +#define __UAPI_FLOG_H__ + +#include +#include +#include + +/* + * We work with up to 32 arguments in macros here. + * If more provided -- behaviour is undefined. + */ + +/* + * By Laurent Deniau at https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s + */ +#define FLOG_PP_NARG_(...) 
FLOG_PP_ARG_N(__VA_ARGS__) +#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) + +#define FLOG_PP_ARG_N(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, \ + _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, N, ...) \ + N + +#define FLOG_PP_RSEQ_N() \ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, \ + 2, 1, 0 + +#define FLOG_GENMASK_0(N, x) 0 +#define FLOG_GENMASK_1(N, op, x, ...) (op(N, 0, x)) +#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_11(N, op, x, ...) ((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_17(N, op, x, ...) 
((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_22(N, op, x, ...) ((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_32(N, op, x, ...) ((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) + +#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) +#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) +#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 + +#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) +#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) + +#define flog_genbit(ord, n, v, ...) 
\ + _Generic((v), \ + \ + /* Basic types */ \ + char: 0, \ + signed char: 0, \ + unsigned char: 0, \ + signed short int: 0, \ + unsigned short int: 0, \ + signed int: 0, \ + unsigned int: 0, \ + signed long: 0, \ + unsigned long: 0, \ + signed long long: 0, \ + unsigned long long: 0, \ + \ + /* Not used for a while */ \ + /* float: 12, */ \ + /* double: 13, */ \ + /* long double: 14, */ \ + \ + /* Basic pointers */ \ + char *: (1u << (ord - n - 1)), \ + signed char *: (1u << (ord - n - 1)), \ + unsigned char *: (1u << (ord - n - 1)), \ + signed short int *: 0, \ + unsigned short int *: 0, \ + signed int *: 0, \ + unsigned int *: 0, \ + signed long *: 0, \ + unsigned long *: 0, \ + signed long long *: 0, \ + unsigned long long *: 0, \ + void *: 0, \ + \ + /* Const basic pointers */ \ + const char *: (1u << (ord - n - 1)), \ + const signed char *: (1u << (ord - n - 1)), \ + const unsigned char *: (1u << (ord - n - 1)), \ + const signed short int *: 0, \ + const unsigned short int *: 0, \ + const signed int *: 0, \ + const unsigned int *: 0, \ + const signed long *: 0, \ + const unsigned long *: 0, \ + const signed long long *: 0, \ + const unsigned long long *: 0, \ + const void *: 0, \ + \ + /* Systypes and pointers */ \ + default: -1) + +typedef struct { + unsigned int magic; + unsigned int size; + unsigned int nargs; + unsigned int mask; + long fmt; + long args[0]; +} flog_msg_t; + +extern int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...); +void flog_decode_msg(int fdout, const char *format, ...); +extern int flog_decode_all(int fdin, int fdout); + +#define flog_encode(fdout, fmt, ...)
\ + flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) + +int flog_map_buf(int fdout); +int flog_close(int fdout); + +#endif /* __UAPI_FLOG_H__ */ diff --git a/flog/include/util.h b/flog/include/util.h new file mode 100644 index 0000000000..7b1edb6885 --- /dev/null +++ b/flog/include/util.h @@ -0,0 +1,41 @@ +#ifndef __UTIL_H__ +#define __UTIL_H__ + +#include +#include + +#include "log.h" +#include "types.h" + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) \ + do { \ + if (p) \ + free(p); \ + } while (0) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -ENOMEM; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +#endif /* __UTIL_H__ */ diff --git a/flog/src/Makefile b/flog/src/Makefile new file mode 100644 index 0000000000..ee73ea7252 --- /dev/null +++ b/flog/src/Makefile @@ -0,0 +1,5 @@ +ccflags-y += -DCONFIG_X86_64 -iquote ./include $(OPTS) +ldflags-y += -r + +#obj-y += main.o +obj-y += flog.o diff --git a/flog/src/flog.c b/flog/src/flog.c new file mode 100644 index 0000000000..8f11a36cbf --- /dev/null +++ b/flog/src/flog.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include + +//#include + +#include "uapi/flog.h" +#include "util.h" + +#define MAGIC 0xABCDABCD + +#define BUF_SIZE (1 << 20) +static char _mbuf[BUF_SIZE]; +static char *mbuf = _mbuf; +static char *fbuf; +static uint64_t fsize; +static uint64_t mbuf_size = sizeof(_mbuf); + +/*int flog_decode_all(int fdin, int fdout) +{ + flog_msg_t *m = (void *)mbuf; + 
ffi_type *args[34] = { + [0] = &ffi_type_sint, + [1] = &ffi_type_pointer, + [2 ... 33] = &ffi_type_slong + }; + void *values[34]; + ffi_cif cif; + ffi_arg rc; + size_t i, ret; + char *fmt; + + values[0] = (void *)&fdout; + + while (1) { + ret = read(fdin, mbuf, sizeof(m)); + if (ret == 0) + break; + if (ret < 0) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + if (m->magic != MAGIC) { + fprintf(stderr, "The log file was not properly closed\n"); + break; + } + ret = m->size - sizeof(m); + if (m->size > mbuf_size) { + fprintf(stderr, "The buffer is too small"); + return -1; + } + if (read(fdin, mbuf + sizeof(m), ret) != ret) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + + fmt = mbuf + m->fmt; + values[1] = &fmt; + + for (i = 0; i < m->nargs; i++) { + values[i + 2] = (void *)&m->args[i]; + if (m->mask & (1u << i)) { + m->args[i] = (long)(mbuf + m->args[i]); + } + } + + if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, m->nargs + 2, + &ffi_type_sint, args) == FFI_OK) + ffi_call(&cif, FFI_FN(dprintf), &rc, values); + } + return 0; +}*/ + +static int flog_enqueue(flog_msg_t *m) +{ + if (write(1, m, m->size) != m->size) { + fprintf(stderr, "Unable to write a message\n"); + return -1; + } + return 0; +} + +/*extern char *rodata_start; +extern char *rodata_end; +*/ +/* Pre-allocate a buffer in a file and map it into memory. */ +int flog_map_buf(int fdout) +{ + uint64_t off = 0; + void *addr; + + /* + * Two buffers are mmaped into memory. A new one is mapped when the first + * one is completely filled.
+ */ + if (fbuf && (mbuf - fbuf < BUF_SIZE)) + return 0; + + if (fbuf) { + if (munmap(fbuf, BUF_SIZE * 2)) { + fprintf(stderr, "Unable to unmap a buffer: %m"); + return -1; + } + off = mbuf - fbuf - BUF_SIZE; + fbuf = NULL; + } + + if (fsize == 0) + fsize += BUF_SIZE; + fsize += BUF_SIZE; + + if (ftruncate(fdout, fsize)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + + if (!fbuf) + addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, MAP_FILE | MAP_SHARED, fdout, + fsize - 2 * BUF_SIZE); + else + addr = mremap(fbuf + BUF_SIZE, BUF_SIZE, BUF_SIZE * 2, MREMAP_FIXED, fbuf); + if (addr == MAP_FAILED) { + fprintf(stderr, "Unable to map a buffer: %m"); + return -1; + } + + fbuf = addr; + mbuf = fbuf + off; + mbuf_size = 2 * BUF_SIZE; + + return 0; +} + +int flog_close(int fdout) +{ + if (mbuf == _mbuf) + return 0; + + munmap(fbuf, BUF_SIZE * 2); + + if (ftruncate(fdout, fsize - 2 * BUF_SIZE + mbuf - fbuf)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + return 0; +} + +int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...) +{ + flog_msg_t *m; + va_list argptr; + char *str_start, *p; + size_t i; + + if (mbuf != _mbuf && flog_map_buf(fdout)) + return -1; + + m = (void *)mbuf; + + m->nargs = nargs; + m->mask = mask; + + str_start = (void *)m->args + sizeof(m->args[0]) * nargs; + p = memccpy(str_start, format, 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + return -1; + } + m->fmt = str_start - mbuf; + str_start = p; + + va_start(argptr, format); + for (i = 0; i < nargs; i++) { + m->args[i] = (long)va_arg(argptr, long); + /* + * If we got a string, we should either + * reference it when in rodata, or make + * a copy (FIXME implement rodata refs). 
+ */ + if (mask & (1u << i)) { + p = memccpy(str_start, (void *)m->args[i], 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + va_end(argptr); + return -1; + } + m->args[i] = str_start - mbuf; + str_start = p; + } + } + va_end(argptr); + m->size = str_start - mbuf; + + /* + * A magic is required to know where we stop writing into a log file, + * if it was not properly closed. The file is mapped into memory, so a + * space in the file is allocated in advance and at the end it can have + * some unused tail. + */ + m->magic = MAGIC; + + m->size = roundup(m->size, 8); + if (mbuf == _mbuf) { + if (flog_enqueue(m)) + return -1; + } else { + mbuf += m->size; + mbuf_size -= m->size; + } + return 0; +} diff --git a/flog/src/main.c b/flog/src/main.c new file mode 100644 index 0000000000..fc5d64ebd2 --- /dev/null +++ b/flog/src/main.c @@ -0,0 +1,159 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "flog.h" + +extern char _rodata_start, _rodata_end; +char *rodata_start = &_rodata_start; +char *rodata_end = &_rodata_end; + +enum { + MODE_BINARY, + MODE_FPRINTF, + MODE_SPRINTF, + MODE_DPRINTF, +}; + +int main(int argc, char *argv[]) +{ + static const char str1[] = "String1 String1"; + static const char str2[] = "string2 string2 string2"; + int fdout = STDOUT_FILENO; + bool use_decoder = false; + int mode = MODE_BINARY; + size_t niter = 100; + int opt, idx; + size_t i; + + static const char short_opts[] = "m:o:di:h"; + static struct option long_opts[] = { + { "mode", required_argument, 0, 'm' }, { "output", required_argument, 0, 'o' }, + { "decode", no_argument, 0, 'd' }, { "iter", required_argument, 0, 'i' }, + { "help", no_argument, 0, 'h' }, {}, + }; + + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + + switch (opt) { + case 'm': + if (strcmp(optarg, "binary") == 0) { + mode = MODE_BINARY; + } else if 
(strcmp(optarg, "fprintf") == 0) { + mode = MODE_FPRINTF; + } else if (strcmp(optarg, "sprintf") == 0) { + mode = MODE_SPRINTF; + } else if (strcmp(optarg, "dprintf") == 0) { + mode = MODE_DPRINTF; + } else + goto usage; + break; + case 'o': + if (strcmp(optarg, "stdout") == 0) { + fdout = fileno(stdout); + } else if (strcmp(optarg, "stderr") == 0) { + fdout = fileno(stderr); + } else { + fdout = open(optarg, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fdout < 0) { + fprintf(stderr, "Can't open %s: %s\n", optarg, strerror(errno)); + exit(1); + } + } + break; + case 'i': + niter = atoi(optarg); + break; + case 'd': + use_decoder = true; + break; + case 'h': + default: + goto usage; + } + } + + switch (mode) { + case MODE_BINARY: + if (use_decoder) + return flog_decode_all(STDIN_FILENO, fdout); + + if (fdout != STDOUT_FILENO && flog_map_buf(fdout)) + return 1; + for (i = 0; i < niter; i++) + if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, + (short)2, (unsigned long)2)) + return 1; + if (flog_close(fdout)) + return 1; + break; + case MODE_DPRINTF: { + for (i = 0; i < niter; i++) { + dprintf(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + case MODE_FPRINTF: { + FILE *f = fdopen(fdout, "w"); + + for (i = 0; i < niter; i++) { + fprintf(f, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + fflush(f); + } + fclose(f); + break; + } + case MODE_SPRINTF: { + static char buf[4096]; + + for (i = 0; i < niter; i++) { + sprintf(buf, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + default: + return 1; + } + + return 0; +usage: + fprintf(stderr, + "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" + "\n" + + "Examples:\n" + "\n" + + " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" + 
"\n" + " flog -m dprintf -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode without processing (queue messages only)\n" + "\n" + " flog -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after\n" + "\n" + " flog -i 100000 -d\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after, writting results into 'out' file\n" + "\n" + " flog -i 100000 -d -o out\n" + "\n"); + return 1; +} diff --git a/flog/tests/test00 b/flog/tests/test00 new file mode 100755 index 0000000000..a7937e4a18 --- /dev/null +++ b/flog/tests/test00 @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e -x + +echo Map a log file into memory +time ./flog run -i 1000000 -o /tmp/flog.raw.map +echo Write into a log file +time ./flog run -i 1000000 > /tmp/flog.raw +echo Use fprintf +time ./flog run -m fprintf -i 1000000 -o /tmp/flog.fprintf.txt +echo Use dprintf +time ./flog run -m dprintf -i 1000000 -o /tmp/flog.dprintf.txt +echo Use sprintf +time ./flog run -m sprintf -i 1000000 + +time ./flog run -d < /tmp/flog.raw > /tmp/flog.raw.txt +cmp /tmp/flog.raw.txt /tmp/flog.fprintf.txt + +time ./flog run -d < /tmp/flog.raw.map > /tmp/flog.raw.map.txt +cmp /tmp/flog.raw.map.txt /tmp/flog.fprintf.txt + +cmp /tmp/flog.dprintf.txt /tmp/flog.fprintf.txt diff --git a/images/Makefile b/images/Makefile index 2eaeb7cad2..58e585ad52 100644 --- a/images/Makefile +++ b/images/Makefile @@ -71,6 +71,7 @@ proto-obj-y += img-streamer.o proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o +proto-obj-y += io_uring.o CFLAGS += -iquote $(obj)/ diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c11860..7530315448 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -20,6 +20,7 @@ import "pipe.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; +import "io_uring.proto"; enum fd_types { UND = 0; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + IO_URING = 20; /* Any number above the 
real used. Not stored to image */ CTL_TTY = 65534; @@ -57,25 +59,26 @@ message fdinfo_entry { } message file_entry { - required fd_types type = 1; - required uint32 id = 2; - optional reg_file_entry reg = 3; - optional inet_sk_entry isk = 4; - optional ns_file_entry nsf = 5; - optional packet_sock_entry psk = 6; - optional netlink_sk_entry nlsk = 7; - optional eventfd_file_entry efd = 8; - optional eventpoll_file_entry epfd = 9; - optional signalfd_entry sgfd = 10; - optional tunfile_entry tunf = 11; - optional timerfd_entry tfd = 12; - optional inotify_file_entry ify = 13; - optional fanotify_file_entry ffy = 14; - optional ext_file_entry ext = 15; - optional unix_sk_entry usk = 16; - optional fifo_entry fifo = 17; - optional pipe_entry pipe = 18; - optional tty_file_entry tty = 19; - optional memfd_file_entry memfd = 20; - optional bpfmap_file_entry bpf = 21; + required fd_types type = 1; + required uint32 id = 2; + optional reg_file_entry reg = 3; + optional inet_sk_entry isk = 4; + optional ns_file_entry nsf = 5; + optional packet_sock_entry psk = 6; + optional netlink_sk_entry nlsk = 7; + optional eventfd_file_entry efd = 8; + optional eventpoll_file_entry epfd = 9; + optional signalfd_entry sgfd = 10; + optional tunfile_entry tunf = 11; + optional timerfd_entry tfd = 12; + optional inotify_file_entry ify = 13; + optional fanotify_file_entry ffy = 14; + optional ext_file_entry ext = 15; + optional unix_sk_entry usk = 16; + optional fifo_entry fifo = 17; + optional pipe_entry pipe = 18; + optional tty_file_entry tty = 19; + optional memfd_file_entry memfd = 20; + optional bpfmap_file_entry bpf = 21; + optional io_uring_file_entry io_uring = 22; } diff --git a/images/io_uring.proto b/images/io_uring.proto new file mode 100644 index 0000000000..cb933d0b56 --- /dev/null +++ b/images/io_uring.proto @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; + +message io_uring_group_id { + required uint32 
group = 1; + required string group_name = 2; +} + +message io_uring_personality_id { + required uint32 uid = 1; + required uint32 euid = 2; + required uint32 suid = 3; + required uint32 fsuid = 4; + required uint32 gid = 5; + required uint32 egid = 6; + required uint32 sgid = 7; + required uint32 fsgid = 8; + required string uid_name = 9; + required string euid_name = 10; + required string suid_name = 11; + required string fsuid_name = 12; + required string gid_name = 13; + required string egid_name = 14; + required string sgid_name = 15; + required string fsgid_name = 16; + required uint64 cap_eff = 17; + repeated io_uring_group_id group_id = 18; +} + +message io_uring_file_entry { + required uint32 id = 1; + required uint32 flags = 2 [(criu).flags = "rfile.flags"]; + required uint64 pos = 3; + required fown_entry fown = 4; + /* Instance */ + required uint32 setup_flags = 5; + required uint32 sq_thread_cpu = 6; + required uint32 sq_thread_idle = 7; + required uint64 nr_user_bufs = 8; + required uint64 nr_user_files = 9; + required uint32 sq_entries = 10; + required uint32 cq_entries = 11; + required uint32 sq_off_array = 12; + required uint32 inode = 13; + /* Ring */ + required uint32 sq_head = 14; + required uint32 sq_tail = 15; + required uint32 cqe_head = 16; + required uint32 cqe_tail = 17; + required uint32 sq_ring_mask = 18; + required uint32 cq_ring_mask = 19; + /* Restrictions */ + required bool restrictions = 20; + required uint32 reg_op = 21; + required uint32 sqe_op = 22; + required uint32 sqe_flags_allowed = 23; + required uint32 sqe_flags_required = 24; + /* Personality */ + repeated io_uring_personality_id pers_id = 25; + optional sint32 mnt_id = 26 [default = -1]; +} + +message io_uring_data_entry { + required uint32 id = 1; + required uint32 sqe_bytes = 2; /* Bytes required for SQEs */ + required uint32 cqe_bytes = 3; /* Bytes required for CQEs */ + required uint32 sq_arr_bytes = 4; /* Bytes required for SQ array */ +} diff --git a/images/rpc.proto 
b/images/rpc.proto index a9f51ac4bf..1d3befd23c 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -172,6 +172,8 @@ enum criu_req_type { WAIT_PID = 11; PAGE_SERVER_CHLD = 12; + + SINGLE_PRE_DUMP = 13; } /* diff --git a/images/sk-opts.proto b/images/sk-opts.proto index 2377f6b629..1d24d47cc7 100644 --- a/images/sk-opts.proto +++ b/images/sk-opts.proto @@ -31,6 +31,8 @@ message sk_opts_entry { optional uint32 tcp_keepintvl = 22; optional uint32 so_oobinline = 23; optional uint32 so_linger = 24; + + optional uint32 so_buf_lock = 25; } enum sk_shutdown { diff --git a/include/common/arch/ppc64/asm/bitops.h b/include/common/arch/ppc64/asm/bitops.h index 704668263d..dbfa6be7f1 100644 --- a/include/common/arch/ppc64/asm/bitops.h +++ b/include/common/arch/ppc64/asm/bitops.h @@ -196,7 +196,7 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ + if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: return result + __ffs(tmp); diff --git a/include/common/arch/x86/asm/bitops.h b/include/common/arch/x86/asm/bitops.h index d7a60589b1..c13c1eb451 100644 --- a/include/common/arch/x86/asm/bitops.h +++ b/include/common/arch/x86/asm/bitops.h @@ -113,7 +113,7 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ + if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. 
*/ found_middle: return result + __ffs(tmp); diff --git a/include/common/asm-generic/bitops.h b/include/common/asm-generic/bitops.h index 064ba4cc47..004da4c4ed 100644 --- a/include/common/asm-generic/bitops.h +++ b/include/common/asm-generic/bitops.h @@ -97,7 +97,7 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ + if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: return result + __ffs(tmp); diff --git a/lib/c/criu.c b/lib/c/criu.c index ddc6e0731f..dea5896f7b 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -238,6 +238,7 @@ void criu_local_free_opts(criu_opts *opts) free(opts->rpc->freeze_cgroup); free(opts->rpc->log_file); free(opts->rpc->lsm_profile); + free(opts->rpc->lsm_mount_context); free(opts->rpc); criu_free_service(opts); free(opts); @@ -651,6 +652,20 @@ int criu_set_lsm_profile(const char *name) return criu_local_set_lsm_profile(global_opts, name); } +int criu_local_set_lsm_mount_context(criu_opts *opts, const char *name) +{ + opts->rpc->lsm_mount_context = strdup(name); + if (opts->rpc->lsm_mount_context == NULL) { + return -ENOMEM; + } + return 0; +} + +int criu_set_lsm_mount_context(const char *name) +{ + return criu_local_set_lsm_mount_context(global_opts, name); +} + void criu_local_set_timeout(criu_opts *opts, unsigned int timeout) { opts->rpc->timeout = timeout; @@ -1512,7 +1527,7 @@ int criu_check(void) return criu_local_check(global_opts); } -int criu_local_dump(criu_opts *opts) +static int dump(bool pre_dump, criu_opts *opts) { int ret = -1; CriuReq req = CRIU_REQ__INIT; @@ -1520,7 +1535,7 @@ int criu_local_dump(criu_opts *opts) saved_errno = 0; - req.type = CRIU_REQ_TYPE__DUMP; + req.type = pre_dump ? 
CRIU_REQ_TYPE__SINGLE_PRE_DUMP : CRIU_REQ_TYPE__DUMP; req.opts = opts->rpc; ret = send_req_and_recv_resp(opts, &req, &resp); @@ -1528,7 +1543,7 @@ int criu_local_dump(criu_opts *opts) goto exit; if (resp->success) { - if (resp->dump->has_restored && resp->dump->restored) + if (!pre_dump && resp->dump->has_restored && resp->dump->restored) ret = 1; else ret = 0; @@ -1546,11 +1561,26 @@ int criu_local_dump(criu_opts *opts) return ret; } +int criu_local_dump(criu_opts *opts) +{ + return dump(false, opts); +} + int criu_dump(void) { return criu_local_dump(global_opts); } +int criu_local_pre_dump(criu_opts *opts) +{ + return dump(true, opts); +} + +int criu_pre_dump(void) +{ + return criu_local_pre_dump(global_opts); +} + int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)) { int ret = -1, fd = -1, uret; @@ -1895,3 +1925,75 @@ int criu_join_ns_add(const char *ns, const char *ns_file, const char *extra_opt) { return criu_local_join_ns_add(global_opts, ns, ns_file, extra_opt); } + +int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size) +{ + CriuFeatures criu_features = CRIU_FEATURES__INIT; + struct criu_feature_check features_copy = { 0 }; + CriuReq req = CRIU_REQ__INIT; + CriuResp *resp = NULL; + int ret = -1; + + saved_errno = 0; + + if (!features) + goto exit; + + if (size > sizeof(struct criu_feature_check)) + goto exit; + + memcpy(&features_copy, features, size); + + req.type = CRIU_REQ_TYPE__FEATURE_CHECK; + req.opts = opts->rpc; + + if (features_copy.mem_track) { + criu_features.has_mem_track = true; + criu_features.mem_track = true; + } + if (features_copy.lazy_pages) { + criu_features.has_lazy_pages = true; + criu_features.lazy_pages = true; + } + if (features_copy.pidfd_store) { + criu_features.has_pidfd_store = true; + criu_features.pidfd_store = true; + } + req.features = &criu_features; + + ret = send_req_and_recv_resp(opts, &req, &resp); + if (ret) + goto exit; + + memset(&features_copy, 0, 
sizeof(struct criu_feature_check)); + + if (resp->success) { + if (resp->features->has_mem_track) { + features_copy.mem_track = resp->features->mem_track; + } + if (resp->features->has_lazy_pages) { + features_copy.lazy_pages = resp->features->lazy_pages; + } + if (resp->features->has_pidfd_store) { + features_copy.pidfd_store = resp->features->pidfd_store; + } + memcpy(features, &features_copy, size); + } else { + ret = -EBADE; + } + +exit: + if (resp) + criu_resp__free_unpacked(resp, NULL); + + swrk_wait(opts); + + errno = saved_errno; + + return ret; +} + +int criu_feature_check(struct criu_feature_check *features, size_t size) +{ + return criu_local_feature_check(global_opts, features, size); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index 949902f559..aed2c34813 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -92,6 +92,7 @@ void criu_set_manage_cgroups(bool manage); void criu_set_manage_cgroups_mode(enum criu_cg_mode mode); int criu_set_freeze_cgroup(const char *name); int criu_set_lsm_profile(const char *name); +int criu_set_lsm_mount_context(const char *name); void criu_set_timeout(unsigned int timeout); void criu_set_auto_ext_mnt(bool val); void criu_set_ext_sharing(bool val); @@ -160,6 +161,7 @@ int criu_get_orphan_pts_master_fd(void); */ int criu_check(void); int criu_dump(void); +int criu_pre_dump(void); int criu_restore(void); int criu_restore_child(void); @@ -249,6 +251,7 @@ void criu_local_set_manage_cgroups(criu_opts *opts, bool manage); void criu_local_set_manage_cgroups_mode(criu_opts *opts, enum criu_cg_mode mode); int criu_local_set_freeze_cgroup(criu_opts *opts, const char *name); int criu_local_set_lsm_profile(criu_opts *opts, const char *name); +int criu_local_set_lsm_mount_context(criu_opts *opts, const char *name); void criu_local_set_timeout(criu_opts *opts, unsigned int timeout); void criu_local_set_auto_ext_mnt(criu_opts *opts, bool val); void criu_local_set_ext_sharing(criu_opts *opts, bool val); @@ -277,6 +280,7 @@ void 
criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_noti int criu_local_check(criu_opts *opts); int criu_local_dump(criu_opts *opts); +int criu_local_pre_dump(criu_opts *opts); int criu_local_restore(criu_opts *opts); int criu_local_restore_child(criu_opts *opts); int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)); @@ -284,6 +288,35 @@ int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)); int criu_local_get_version(criu_opts *opts); int criu_local_check_version(criu_opts *opts, int minimum); +/* + * Feature checking allows the user to check if CRIU supports + * certain features. There are CRIU features which do not depend + * on the version of CRIU but on kernel features or architecture. + * + * One example is memory tracking. Memory tracking can be disabled + * in the kernel or there are architectures which do not support + * it (aarch64 for example). By using the feature check a libcriu + * user can easily query CRIU if a certain feature is available. + * + * The features which should be checked can be marked in the + * structure 'struct criu_feature_check'. Each structure member + * that is set to true will result in CRIU checking for the + * availability of that feature in the current combination of + * CRIU/kernel/architecture. + * + * Available features will be set to true when the function + * returns successfully. Missing features will be set to false. 
+ */ + +struct criu_feature_check { + bool mem_track; + bool lazy_pages; + bool pidfd_store; +}; + +int criu_feature_check(struct criu_feature_check *features, size_t size); +int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size); + #ifdef __GNUG__ } #endif diff --git a/lib/py/images/images.py b/lib/py/images/images.py index 300b1cc69a..3b72edf42e 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -384,6 +384,21 @@ def skip(self, f, pload): f.seek(pload.bytes, os.SEEK_CUR) return pload.bytes +class io_uring_data_extra_handler: + def load(self, f, pload): + size = pload.sqe_bytes + pload.cqe_bytes + pload.sq_arr_bytes + data = f.read(size) + return base64.encodebytes(data).decode('utf-8') + + def dump(self, extra, f, pload): + data = base64.decodebytes(extra) + f.write(data) + + def skip(self, f, pload): + size = pload.sqe_bytes + pload.cqe_bytes + pload.sq_arr_bytes + f.seek(size, os.SEEK_CUR) + return size + class ipc_sem_set_handler: def load(self, f, pbuff): entry = pb2dict.pb2dict(pbuff) @@ -562,6 +577,9 @@ def skip(self, f, pbuff): 'BPFMAP_DATA': entry_handler(pb.bpfmap_data_entry, bpfmap_data_extra_handler()), 'APPARMOR': entry_handler(pb.apparmor_entry), + 'IO_URING_FILE': entry_handler(pb.io_uring_file_entry), + 'IO_URING_DATA': entry_handler(pb.io_uring_data_entry, + io_uring_data_extra_handler()), } diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile new file mode 100644 index 0000000000..45a9ec611f --- /dev/null +++ b/plugins/amdgpu/Makefile @@ -0,0 +1,13 @@ +all: dummy_plugin.so + +dummy_plugin.so: dummy_plugin.c + gcc -g -Werror -D _GNU_SOURCE -Wall -shared -nostartfiles dummy_plugin.c -o dummy_plugin.so -iquote ../../../criu/include -iquote ../../criu/include -fPIC + +clean: + $(Q) $(RM) dummy_plugin.so +install: + $(Q) mkdir -p $(PLUGINDIR) + $(Q) install -m 644 dummy_plugin.so $(PLUGINDIR) + +uninstall: + $(Q) $(RM) $(PLUGINDIR)/dummy_plugin.so diff --git 
a/plugins/amdgpu/dummy_plugin.c b/plugins/amdgpu/dummy_plugin.c new file mode 100644 index 0000000000..8722760950 --- /dev/null +++ b/plugins/amdgpu/dummy_plugin.c @@ -0,0 +1,36 @@ +#include + +#include "criu-log.h" +#include "criu-plugin.h" + +int dummy_plugin_handle_device_vma(int fd, const struct stat *stat) +{ + pr_info("dummy_plugin: Inside %s for fd = %d\n", __func__, fd); + /* let criu report failure for the unsupported mapping */ + return -ENOTSUP; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, dummy_plugin_handle_device_vma) + +int dummy_plugin_resume_devices_late(int target_pid) +{ + pr_info("dummy_plugin: Inside %s for target pid = %d\n", __func__, target_pid); + return -ENOTSUP; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, dummy_plugin_resume_devices_late) + +/* + * return 0 if no match found + * return -1 for error or -ENOTSUP. + * return 1 if vmap map must be adjusted. + */ +int dummy_plugin_update_vmamap(const char *old_path, char *new_path, const uint64_t addr, const uint64_t old_offset, + uint64_t *new_offset) +{ + uint64_t temp = 100; + + *new_offset = temp; + pr_info("dummy_plugin: old_pgoff= 0x%lu new_pgoff = 0x%lx old_path = %s new_path = %s addr = 0x%lu\n", + old_offset, *new_offset, old_path, new_path, addr); + return -ENOTSUP; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, dummy_plugin_update_vmamap) diff --git a/scripts/build/Dockerfile.aarch64-cross.tmpl b/scripts/build/Dockerfile.aarch64-cross.tmpl deleted file mode 120000 index 50eff9213e..0000000000 --- a/scripts/build/Dockerfile.aarch64-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.aarch64-cross.hdr b/scripts/build/Dockerfile.aarch64-stable-cross.hdr similarity index 100% rename from scripts/build/Dockerfile.aarch64-cross.hdr rename to scripts/build/Dockerfile.aarch64-stable-cross.hdr diff --git a/scripts/build/Dockerfile.aarch64-stable-cross.tmpl 
b/scripts/build/Dockerfile.aarch64-stable-cross.tmpl new file mode 120000 index 0000000000..81ef22980f --- /dev/null +++ b/scripts/build/Dockerfile.aarch64-stable-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.stable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.aarch64-unstable-cross.hdr b/scripts/build/Dockerfile.aarch64-unstable-cross.hdr new file mode 100644 index 0000000000..c61d2af276 --- /dev/null +++ b/scripts/build/Dockerfile.aarch64-unstable-cross.hdr @@ -0,0 +1,5 @@ +FROM docker.io/dockcross/base:latest + +ENV ARCH=aarch64 +ENV DEBIAN_ARCH=arm64 +ENV CROSS_TRIPLET=aarch64-linux-gnu diff --git a/scripts/build/Dockerfile.aarch64-unstable-cross.tmpl b/scripts/build/Dockerfile.aarch64-unstable-cross.tmpl new file mode 120000 index 0000000000..955ae1fd4b --- /dev/null +++ b/scripts/build/Dockerfile.aarch64-unstable-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.unstable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index a6579c0bbf..cab72e8a18 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -6,6 +6,7 @@ RUN apk update && apk add \ bash \ build-base \ coreutils \ + procps \ git \ gnutls-dev \ libaio-dev \ diff --git a/scripts/build/Dockerfile.armv7-cross.tmpl b/scripts/build/Dockerfile.armv7-cross.tmpl deleted file mode 120000 index 50eff9213e..0000000000 --- a/scripts/build/Dockerfile.armv7-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.armv7-cross.hdr b/scripts/build/Dockerfile.armv7-stable-cross.hdr similarity index 100% rename from scripts/build/Dockerfile.armv7-cross.hdr rename to scripts/build/Dockerfile.armv7-stable-cross.hdr diff --git a/scripts/build/Dockerfile.armv7-stable-cross.tmpl b/scripts/build/Dockerfile.armv7-stable-cross.tmpl new file mode 120000 index 0000000000..81ef22980f --- /dev/null +++ b/scripts/build/Dockerfile.armv7-stable-cross.tmpl 
@@ -0,0 +1 @@ +Dockerfile.stable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.armv7-unstable-cross.hdr b/scripts/build/Dockerfile.armv7-unstable-cross.hdr new file mode 100644 index 0000000000..f96dc51f70 --- /dev/null +++ b/scripts/build/Dockerfile.armv7-unstable-cross.hdr @@ -0,0 +1,6 @@ +FROM docker.io/dockcross/base:latest + +ENV ARCH=arm +ENV SUBARCH=armv7 +ENV DEBIAN_ARCH=armhf +ENV CROSS_TRIPLET=arm-linux-gnueabihf diff --git a/scripts/build/Dockerfile.armv7-unstable-cross.tmpl b/scripts/build/Dockerfile.armv7-unstable-cross.tmpl new file mode 120000 index 0000000000..955ae1fd4b --- /dev/null +++ b/scripts/build/Dockerfile.armv7-unstable-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.unstable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index 9d3bb0f879..fd4ba4aefe 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ b/scripts/build/Dockerfile.fedora.tmpl @@ -1,5 +1,8 @@ ARG CC=gcc +# FIXME: Temporary solution for https://github.com/checkpoint-restore/criu/issues/1696 +ENV GLIBC_TUNABLES=glibc.pthread.rseq=0 + COPY scripts/ci/prepare-for-fedora-rawhide.sh /bin/prepare-for-fedora-rawhide.sh RUN /bin/prepare-for-fedora-rawhide.sh diff --git a/scripts/build/Dockerfile.mips64el-cross.tmpl b/scripts/build/Dockerfile.mips64el-cross.tmpl deleted file mode 120000 index 50eff9213e..0000000000 --- a/scripts/build/Dockerfile.mips64el-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.mips64el-cross.hdr b/scripts/build/Dockerfile.mips64el-stable-cross.hdr similarity index 100% rename from scripts/build/Dockerfile.mips64el-cross.hdr rename to scripts/build/Dockerfile.mips64el-stable-cross.hdr diff --git a/scripts/build/Dockerfile.mips64el-stable-cross.tmpl b/scripts/build/Dockerfile.mips64el-stable-cross.tmpl new file mode 120000 index 0000000000..81ef22980f --- /dev/null +++ 
b/scripts/build/Dockerfile.mips64el-stable-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.stable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.mips64el-unstable-cross.hdr b/scripts/build/Dockerfile.mips64el-unstable-cross.hdr new file mode 100644 index 0000000000..e78c94aa56 --- /dev/null +++ b/scripts/build/Dockerfile.mips64el-unstable-cross.hdr @@ -0,0 +1,6 @@ +FROM dockcross/base:latest + +ENV ARCH=mips +ENV SUBARCH=mips +ENV DEBIAN_ARCH=mips64el +ENV CROSS_TRIPLET=mips64el-linux-gnuabi64 diff --git a/scripts/build/Dockerfile.mips64el-unstable-cross.tmpl b/scripts/build/Dockerfile.mips64el-unstable-cross.tmpl new file mode 120000 index 0000000000..955ae1fd4b --- /dev/null +++ b/scripts/build/Dockerfile.mips64el-unstable-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.unstable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.ppc64-cross.tmpl b/scripts/build/Dockerfile.ppc64-cross.tmpl deleted file mode 120000 index 50eff9213e..0000000000 --- a/scripts/build/Dockerfile.ppc64-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.ppc64-cross.hdr b/scripts/build/Dockerfile.ppc64-stable-cross.hdr similarity index 100% rename from scripts/build/Dockerfile.ppc64-cross.hdr rename to scripts/build/Dockerfile.ppc64-stable-cross.hdr diff --git a/scripts/build/Dockerfile.ppc64-stable-cross.tmpl b/scripts/build/Dockerfile.ppc64-stable-cross.tmpl new file mode 120000 index 0000000000..81ef22980f --- /dev/null +++ b/scripts/build/Dockerfile.ppc64-stable-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.stable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.ppc64-unstable-cross.hdr b/scripts/build/Dockerfile.ppc64-unstable-cross.hdr new file mode 100644 index 0000000000..38547ac557 --- /dev/null +++ b/scripts/build/Dockerfile.ppc64-unstable-cross.hdr @@ -0,0 +1,5 @@ +FROM dockcross/base:latest + +ENV ARCH=ppc64 +ENV DEBIAN_ARCH=ppc64el +ENV 
CROSS_TRIPLET=powerpc64le-linux-gnu diff --git a/scripts/build/Dockerfile.ppc64-unstable-cross.tmpl b/scripts/build/Dockerfile.ppc64-unstable-cross.tmpl new file mode 120000 index 0000000000..955ae1fd4b --- /dev/null +++ b/scripts/build/Dockerfile.ppc64-unstable-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.unstable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl similarity index 88% rename from scripts/build/Dockerfile.cross.tmpl rename to scripts/build/Dockerfile.stable-cross.tmpl index 8b95fbb1c0..6a68cd1ca6 100644 --- a/scripts/build/Dockerfile.cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -1,9 +1,8 @@ COPY scripts/ci/apt-install /bin/apt-install # Add the cross compiler sources -RUN echo "deb http://deb.debian.org/debian/ buster main" >> /etc/apt/sources.list && \ - dpkg --add-architecture ${DEBIAN_ARCH} && \ - apt-install emdebian-archive-keyring +RUN echo "deb http://deb.debian.org/debian/ stable main" >> /etc/apt/sources.list && \ + dpkg --add-architecture ${DEBIAN_ARCH} RUN apt-install \ crossbuild-essential-${DEBIAN_ARCH} \ diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl new file mode 100644 index 0000000000..dacfd96ef0 --- /dev/null +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -0,0 +1,42 @@ +COPY scripts/ci/apt-install /bin/apt-install + +# Add the cross compiler sources +RUN echo "deb http://deb.debian.org/debian/ unstable main" >> /etc/apt/sources.list && \ + dpkg --add-architecture ${DEBIAN_ARCH} + +RUN apt-install \ + crossbuild-essential-${DEBIAN_ARCH} \ + libc6-dev-${DEBIAN_ARCH}-cross \ + libc6-${DEBIAN_ARCH}-cross \ + libbz2-dev:${DEBIAN_ARCH} \ + libexpat1-dev:${DEBIAN_ARCH} \ + ncurses-dev:${DEBIAN_ARCH} \ + libssl-dev:${DEBIAN_ARCH} \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf \ + libnl-3-dev:${DEBIAN_ARCH} \ + libprotobuf-dev:${DEBIAN_ARCH} \ + 
libnet-dev:${DEBIAN_ARCH} \ + libprotobuf-c-dev:${DEBIAN_ARCH} \ + libcap-dev:${DEBIAN_ARCH} \ + libaio-dev:${DEBIAN_ARCH} \ + libnl-route-3-dev:${DEBIAN_ARCH} + +ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLET} \ + AS=/usr/bin/${CROSS_TRIPLET}-as \ + AR=/usr/bin/${CROSS_TRIPLET}-ar \ + CC=/usr/bin/${CROSS_TRIPLET}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ + LD=/usr/bin/${CROSS_TRIPLET}-ld \ + FC=/usr/bin/${CROSS_TRIPLET}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig + +COPY . /criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 62e3a99204..2c006ad873 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,5 +1,7 @@ ARCHES := x86_64 fedora-asan fedora-rawhide centos7 armv7hf centos8 -NON_CLANG := armv7-cross aarch64-cross ppc64-cross mips64el-cross +STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross +UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross +NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) CREATE_DOCKERFILES := $(ARCHES) $(NON_CLANG) TARGETS := $(ARCHES) alpine archlinux TARGETS_CLANG := $(addsuffix $(TARGETS),-clang) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 7c66e68023..d0cd55f7c6 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -62,11 +62,9 @@ ci_prep () { } test_stream() { - # We must test CRIU features that dump content into an image file to ensure - # streaming compatibility. 
- STREAM_TEST_PATTERN='.*(ghost|fifo|unlink|memfd|shmem|socket_queue).*' + # Testing CRIU streaming to criu-image-streamer # shellcheck disable=SC2086 - ./test/zdtm.py run --stream -p 2 --keep-going -T "$STREAM_TEST_PATTERN" $ZDTM_OPTS + ./test/zdtm.py run --stream -p 2 --keep-going -a $ZDTM_OPTS } print_header() { @@ -197,6 +195,12 @@ fi # shellcheck disable=SC2086 ./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS +# Newer kernels are blocking access to userfaultfd: +# uffd: Set unprivileged_userfaultfd sysctl knob to 1 if kernel faults must be handled without obtaining CAP_SYS_PTRACE capability +if [ -e /proc/sys/vm/unprivileged_userfaultfd ]; then + echo 1 > /proc/sys/vm/unprivileged_userfaultfd +fi + LAZY_EXCLUDE="-x maps04 -x cmdlinenv00 -x maps007" LAZY_TESTS='.*(maps0|uffd-events|lazy-thp|futex|fork).*' @@ -206,10 +210,8 @@ LAZY_OPTS="-p 2 -T $LAZY_TESTS $LAZY_EXCLUDE $ZDTM_OPTS" ./test/zdtm.py run $LAZY_OPTS --lazy-pages # shellcheck disable=SC2086 ./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages -# FIXME: post-copy migration of THP over TLS (sometimes) fails with: -# Error (criu/tls.c:321): tls: Pull callback recv failed: Connection reset by peer # shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages --tls -x lazy-thp +./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages --tls bash -x ./test/jenkins/criu-fault.sh if [ "$UNAME_M" == "x86_64" ]; then @@ -229,7 +231,12 @@ if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi -make -C test/others/rpc/ run + +# FIXME: rpc tests fail even with set glibc tunable +# https://github.com/checkpoint-restore/criu/issues/1696 +if [ "$GLIBC_TUNABLES" != "glibc.pthread.rseq=0" ]; then + make -C test/others/rpc/ run +fi ./test/zdtm.py run -t zdtm/static/env00 --sibling @@ -259,6 +266,9 @@ ip net add test # more crit testing make -C test/others/crit run +# coredump testing +make -C 
test/others/criu-coredump run + # libcriu testing make -C test/others/libcriu run diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 839b100c80..4a4a164456 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -1,14 +1,14 @@ #!/bin/bash -# This script is used to run vagrant based tests on Travis. -# This script is started via sudo from .travis.yml +# This script is used to run vagrant based tests on Cirrus CI. +# This script is started via .cirrus.yml set -e set -x -VAGRANT_VERSION=2.2.16 -FEDORA_VERSION=34 -FEDORA_BOX_VERSION=34.20210423.0 +VAGRANT_VERSION=2.2.19 +FEDORA_VERSION=35 +FEDORA_BOX_VERSION=35.20211026.0 setup() { if [ -n "$TRAVIS" ]; then @@ -50,9 +50,7 @@ fedora-no-vdso() { vagrant reload ssh default cat /proc/cmdline ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' - # BPF tests are failing see: https://github.com/checkpoint-restore/criu/issues/1354 - # Needs to be fixed, skip for now - ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going -x zdtm/static/bpf_hash -x zdtm/static/bpf_array' + ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going' # This test (pidfd_store_sk) requires pidfd_getfd syscall which is guaranteed in Fedora 33. 
# It is also skipped from -a because it runs in RPC mode only ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2' diff --git a/scripts/criu-ns b/scripts/criu-ns index d76db3606d..72c0753e5e 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -102,6 +102,7 @@ def wrap_restore(): criu_pid = os.fork() if criu_pid == 0: + os.setsid() _mount_new_proc() run_criu(restore_args) diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index c9006c518b..0e9545f2dc 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -12,4 +12,5 @@ curl -s "${URL}" | sed -e " s,Intended for clang-format >= 4,Intended for clang-format >= 11,g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; s,\(AlignTrailingComments:.*\)$,\1\nAlignConsecutiveMacros: true,g; + s,AlignTrailingComments: false,AlignTrailingComments: true,g; " > .clang-format diff --git a/scripts/flake8.cfg b/scripts/flake8.cfg index b6a5877299..bd4f95bb20 100644 --- a/scripts/flake8.cfg +++ b/scripts/flake8.cfg @@ -2,3 +2,5 @@ # E501 line too long # W504 line break after binary operator ignore = E501,W504 +# F401: imported but unused +per-file-ignores = __init__.py:F401 diff --git a/soccr/soccr.c b/soccr/soccr.c index f6fb1946b7..8be2d28e15 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -609,8 +609,8 @@ static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsig libnet_type = LIBNET_RAW4; l = libnet_init(libnet_type, /* injection type */ - NULL, /* network interface */ - errbuf); /* errbuf */ + NULL, /* network interface */ + errbuf); /* errbuf */ if (l == NULL) { loge("libnet_init failed (%s)\n", errbuf); return -1; @@ -623,17 +623,17 @@ static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsig ret = libnet_build_tcp(ntohs(sk->dst_addr->v4.sin_port), /* source port */ ntohs(sk->src_addr->v4.sin_port), /* destination port */ - data->inq_seq, /* sequence number */ - 
data->outq_seq - data->outq_len, /* acknowledgement num */ - flags, /* control flags */ - data->rcv_wnd, /* window size */ - 0, /* checksum */ - 10, /* urgent pointer */ - LIBNET_TCP_H + 20, /* TCP packet size */ - NULL, /* payload */ - 0, /* payload size */ - l, /* libnet handle */ - 0); /* libnet id */ + data->inq_seq, /* sequence number */ + data->outq_seq - data->outq_len, /* acknowledgement num */ + flags, /* control flags */ + data->rcv_wnd, /* window size */ + 0, /* checksum */ + 10, /* urgent pointer */ + LIBNET_TCP_H + 20, /* TCP packet size */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ if (ret == -1) { loge("Can't build TCP header: %s\n", libnet_geterror(l)); goto err; @@ -646,28 +646,28 @@ static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsig memcpy(&src, &sk->src_addr->v6.sin6_addr, sizeof(src)); ret = libnet_build_ipv6(0, 0, LIBNET_TCP_H, /* length */ - IPPROTO_TCP, /* protocol */ - 64, /* hop limit */ - dst, /* source IP */ - src, /* destination IP */ - NULL, /* payload */ - 0, /* payload size */ - l, /* libnet handle */ - 0); /* libnet id */ + IPPROTO_TCP, /* protocol */ + 64, /* hop limit */ + dst, /* source IP */ + src, /* destination IP */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ } else if (family == AF_INET) ret = libnet_build_ipv4(LIBNET_IPV4_H + LIBNET_TCP_H + 20, /* length */ - 0, /* TOS */ - 242, /* IP ID */ - 0, /* IP Frag */ - 64, /* TTL */ - IPPROTO_TCP, /* protocol */ - 0, /* checksum */ - dst_v4, /* source IP */ - src_v4, /* destination IP */ - NULL, /* payload */ - 0, /* payload size */ - l, /* libnet handle */ - 0); /* libnet id */ + 0, /* TOS */ + 242, /* IP ID */ + 0, /* IP Frag */ + 64, /* TTL */ + IPPROTO_TCP, /* protocol */ + 0, /* checksum */ + dst_v4, /* source IP */ + src_v4, /* destination IP */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ else { 
loge("Unknown socket family\n"); goto err; diff --git a/soccr/soccr.h b/soccr/soccr.h index 934d438277..e7091e5918 100644 --- a/soccr/soccr.h +++ b/soccr/soccr.h @@ -1,9 +1,9 @@ #ifndef __LIBSOCCR_H__ #define __LIBSOCCR_H__ -#include /* sockaddr_in, sockaddr_in6 */ +#include /* sockaddr_in, sockaddr_in6 */ #include /* TCP_REPAIR_WINDOW, TCP_TIMESTAMP */ -#include /* uint32_t */ -#include /* sockaddr */ +#include /* uint32_t */ +#include /* sockaddr */ #include "common/config.h" diff --git a/test/exhaustive/pipe.py b/test/exhaustive/pipe.py index fdadc480c8..7f1c53d34b 100755 --- a/test/exhaustive/pipe.py +++ b/test/exhaustive/pipe.py @@ -75,7 +75,7 @@ def get_pipe_rw(pid, fd): def check_pipe_y(pid, fd, rw, inos): ino = get_pipe_ino(pid, fd) - if ino == None: + if ino is None: return 'missing ' if not inos.has_key(fd): inos[fd] = ino @@ -89,7 +89,7 @@ def check_pipe_y(pid, fd, rw, inos): def check_pipe_n(pid, fd): ino = get_pipe_ino(pid, fd) - if ino == None: + if ino is None: return None else: return 'present ' @@ -102,7 +102,7 @@ def check_pipe_end(kids, fd, comb, rw, inos): res = check_pipe_y(t_pid, fd, rw, inos) else: res = check_pipe_n(t_pid, fd) - if res != None: + if res is not None: return res + 'kid(%d)' % t_nr t_nr += 1 return None @@ -111,7 +111,7 @@ def check_pipe_end(kids, fd, comb, rw, inos): def check_pipe(kids, fds, comb, inos): for e in (0, 1): # 0 == R, 1 == W, see get_pipe_rw() res = check_pipe_end(kids, fds[e], comb[e], e, inos) - if res != None: + if res is not None: return res + 'end(%d)' % e return None @@ -124,7 +124,7 @@ def check_pipes(kids, pipes, comb): p_inos = {} for p_fds in pipes: res = check_pipe(kids, p_fds, comb[p_nr], p_inos) - if res != None: + if res is not None: return res + 'pipe(%d)' % p_nr p_nr += 1 @@ -182,7 +182,7 @@ def make_comb(comb, opts, status_pipe): if v == '0': print('\tCheck pipes') res = check_pipes(kids, pipes, comb) - if res == None: + if res is None: ex_code = 0 else: print('\tFAIL %s' % res) diff --git 
a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 98dbbb7b0b..114bf957b9 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -304,7 +304,7 @@ def get_dgram_actions(self, st): for psk in st.sockets: if psk == self: continue - if psk.peer != None and psk.peer != self.sk_id: + if psk.peer is not None and psk.peer != self.sk_id: # Peer by someone else, can do nothing continue diff --git a/test/jenkins/criu-dedup.sh b/test/jenkins/criu-dedup.sh index 0041496d80..edb1b653d1 100755 --- a/test/jenkins/criu-dedup.sh +++ b/test/jenkins/criu-dedup.sh @@ -9,8 +9,8 @@ prep # Additionally run these tests as they touch a lot of # memory and it makes sense to additionally check it # with delays between iterations -./test/zdtm.py run -t zdtm/transition/maps007 --keep-going --report report -f h --pre 8:.1 --dedup || fail -./test/zdtm.py run -t zdtm/static/mem-touch --keep-going --report report -f h --pre 8:.1 --dedup || fail -./test/zdtm.py run -t zdtm/transition/maps008 --keep-going --report report -f h --pre 8:.1 --dedup || fail -./test/zdtm.py run -t zdtm/transition/maps007 --keep-going --report report -f h --pre 8:.1 --noauto-dedup || fail -./test/zdtm.py run -t zdtm/static/mem-touch --keep-going --report report -f h --pre 8:.1 --noauto-dedup || fail +./test/zdtm.py run -t zdtm/transition/maps007 --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/static/mem-touch --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/transition/maps008 --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/transition/maps007 --report report -f h --pre 8:.1 --noauto-dedup || fail +./test/zdtm.py run -t zdtm/static/mem-touch --report report -f h --pre 8:.1 --noauto-dedup || fail diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index a8c3a5cf7f..9f20091ccc 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -4,10 +4,10 @@ set -e source `dirname $0`/criu-lib.sh 
prep -./test/zdtm.py run -t zdtm/static/env00 --fault 1 --keep-going --report report -f h || fail -./test/zdtm.py run -t zdtm/static/unlink_fstat00 --fault 2 --keep-going --report report -f h || fail -./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --keep-going --report report -f h || fail -./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --keep-going --pre 2 -f uns || fail +./test/zdtm.py run -t zdtm/static/env00 --fault 1 --report report -f h || fail +./test/zdtm.py run -t zdtm/static/unlink_fstat00 --fault 2 --report report -f h || fail +./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --report report -f h || fail +./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --pre 2 -f uns || fail ./test/zdtm.py run -t zdtm/static/env00 --fault 129 -f uns || fail ./test/zdtm.py run -t zdtm/transition/fork --fault 130 -f h || fail ./test/zdtm.py run -t zdtm/static/vdso01 --fault 127 || fail @@ -17,16 +17,16 @@ if [ "${COMPAT_TEST}" != "y" ] ; then ./test/zdtm.py run -t zdtm/static/vdso01 --fault 133 -f h || fail fi -./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 2 --keep-going --report report || fail -./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 4 --keep-going --report report || fail +./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 2 --report report || fail +./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 4 --report report || fail ./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 6 --report report || fail ./test/zdtm.py run -t zdtm/static/mntns_link_remap --fault 6 --report report || fail ./test/zdtm.py run -t zdtm/static/unlink_fstat03 --fault 6 --report report || fail -./test/zdtm.py run -t zdtm/static/env00 --fault 5 --keep-going --report report || fail -./test/zdtm.py run -t zdtm/static/maps04 --fault 131 --keep-going --report report --pre 2:1 || fail -./test/zdtm.py run -t zdtm/transition/maps008 --fault 131 --keep-going --report report --pre 2:1 || fail +./test/zdtm.py run -t zdtm/static/env00 --fault 5 
--report report || fail +./test/zdtm.py run -t zdtm/static/maps04 --fault 131 --report report --pre 2:1 || fail +./test/zdtm.py run -t zdtm/transition/maps008 --fault 131 --report report --pre 2:1 || fail ./test/zdtm.py run -t zdtm/static/maps01 --fault 132 -f h || fail # 134 is corrupting extended registers set, should run in a sub-thread (fpu03) # without restore (that will check if parasite corrupts extended registers) diff --git a/test/jenkins/criu-fcg.sh b/test/jenkins/criu-fcg.sh index ca5054f5e5..81395b7ba6 100755 --- a/test/jenkins/criu-fcg.sh +++ b/test/jenkins/criu-fcg.sh @@ -6,10 +6,10 @@ source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f --pre 3 || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f --norst || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:f || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:f --pre 3 || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:f --norst || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t --pre 3 || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t --norst || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:t || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:t --pre 3 || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:t --norst || fail diff --git 
a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index 62d9f7edc4..dd774e298b 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -1,4 +1,8 @@ -source ../env.sh +#!/bin/bash + +set -x +# shellcheck disable=SC1091 +source ../env.sh || exit 1 function gen_imgs { PID=$(../loop) @@ -9,7 +13,7 @@ function gen_imgs { exit 1 fi - images_list=$(ls -1 *.img) + images_list=$(ls -1 ./*.img) if [ -z "$images_list" ]; then echo "Failed to generate images" exit 1 @@ -32,7 +36,7 @@ function run_test { for x in $cores do echo "=== try readelf $x" - readelf -a $x || exit $? + readelf -a "$x" || exit $? echo "=== done" done diff --git a/test/others/env.sh b/test/others/env.sh index b514e87d9e..45066f760b 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -13,5 +13,5 @@ fi #export PYTHON CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") crit=$CRIT -CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu-coredump/criu-coredump) +CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump-"${PYTHON}") criu_coredump=$CRIU_COREDUMP diff --git a/test/others/libcriu/.gitignore b/test/others/libcriu/.gitignore index cf1342de2a..0f6e52bb4e 100644 --- a/test/others/libcriu/.gitignore +++ b/test/others/libcriu/.gitignore @@ -4,5 +4,7 @@ test_notify test_self test_sub test_join_ns +test_pre_dump +test_feature_check output/ libcriu.so.* diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index 734e66c1a9..ae73305331 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -6,6 +6,8 @@ TESTS += test_notify TESTS += test_iters TESTS += test_errno TESTS += test_join_ns +TESTS += test_pre_dump +TESTS += test_feature_check all: $(TESTS) .PHONY: all diff --git a/test/others/libcriu/lib.h b/test/others/libcriu/lib.h index 6fdf8aef22..59372fca5f 100644 --- a/test/others/libcriu/lib.h +++ b/test/others/libcriu/lib.h @@ -1,3 +1,5 @@ void 
what_err_ret_mean(int ret); int chk_exit(int status, int want); int get_version(void); + +#define SUCC_ECODE 42 diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 48f25a5f6d..77bdfb87eb 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -58,9 +58,20 @@ run_test test_notify if [ "$(uname -m)" = "x86_64" ]; then # Skip this on aarch64 as aarch64 has no dirty page tracking run_test test_iters + run_test test_pre_dump fi run_test test_errno run_test test_join_ns +if criu check --feature mem_dirty_track > /dev/null; then + export CRIU_FEATURE_MEM_TRACK=1 +fi +if criu check --feature uffd-noncoop > /dev/null; then + export CRIU_FEATURE_LAZY_PAGES=1 +fi +if criu check --feature pidfd_store > /dev/null; then + export CRIU_FEATURE_PIDFD_STORE=1 +fi +run_test test_feature_check echo "== Tests done" make libcriu_clean diff --git a/test/others/libcriu/test_feature_check.c b/test/others/libcriu/test_feature_check.c new file mode 100644 index 0000000000..d88e0de230 --- /dev/null +++ b/test/others/libcriu/test_feature_check.c @@ -0,0 +1,65 @@ +#include "criu.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib.h" + +int main(int argc, char **argv) +{ + int ret; + char *env; + bool mem_track = 0; + bool lazy_pages = 0; + bool pidfd_store = 0; + struct criu_feature_check features = { + .mem_track = true, + .lazy_pages = true, + .pidfd_store = true, + }; + + printf("--- Start feature check ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + + env = getenv("CRIU_FEATURE_MEM_TRACK"); + if (env) { + mem_track = true; + } + env = getenv("CRIU_FEATURE_LAZY_PAGES"); + if (env) { + lazy_pages = true; + } + env = getenv("CRIU_FEATURE_PIDFD_STORE"); + if (env) { + pidfd_store = true; + } + + ret = criu_feature_check(&features, sizeof(features) + 1); + printf(" `- passing too large structure to libcriu should return -1: %d\n", ret); + if (ret != -1) + return -1; + + ret = 
criu_feature_check(&features, sizeof(features)); + if (ret < 0) { + what_err_ret_mean(ret); + return ret; + } + + printf(" `- mem_track : %d - expected : %d\n", features.mem_track, mem_track); + if (features.mem_track != mem_track) + return -1; + printf(" `- lazy_pages : %d - expected : %d\n", features.lazy_pages, lazy_pages); + if (features.lazy_pages != lazy_pages) + return -1; + printf(" `- pidfd_store: %d - expected : %d\n", features.pidfd_store, pidfd_store); + if (features.pidfd_store != pidfd_store) + return -1; + + return 0; +} diff --git a/test/others/libcriu/test_iters.c b/test/others/libcriu/test_iters.c index b7e325abb4..edbaf87f6f 100644 --- a/test/others/libcriu/test_iters.c +++ b/test/others/libcriu/test_iters.c @@ -46,8 +46,6 @@ static int next_iter(criu_predump_info pi) return cur_iter < MAX_ITERS; } -#define SUCC_ECODE 42 - int main(int argc, char **argv) { int pid, ret, p[2]; diff --git a/test/others/libcriu/test_notify.c b/test/others/libcriu/test_notify.c index 9a54b812a0..80ad3ffdcb 100644 --- a/test/others/libcriu/test_notify.c +++ b/test/others/libcriu/test_notify.c @@ -10,8 +10,6 @@ #include "lib.h" -#define SUCC_ECODE 42 - static int actions_called = 0; static int notify(char *action, criu_notify_arg_t na) { diff --git a/test/others/libcriu/test_pre_dump.c b/test/others/libcriu/test_pre_dump.c new file mode 100644 index 0000000000..ed9cd2125b --- /dev/null +++ b/test/others/libcriu/test_pre_dump.c @@ -0,0 +1,151 @@ +#include "criu.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lib.h" + +static int wdir_fd, cur_imgdir = -1; + +static int stop = 0; +static void sh(int sig) +{ + stop = 1; +} + +static void open_imgdir(void) +{ + char p[10]; + static int id = 0; + + if (id > 0) { + sprintf(p, "../dir-%d", id); + criu_set_parent_images(p); + } + if (cur_imgdir != -1) + close(cur_imgdir); + sprintf(p, "dir-%d", ++id); + mkdirat(wdir_fd, p, 0700); + cur_imgdir = openat(wdir_fd, p, 
O_DIRECTORY); + criu_set_images_dir_fd(cur_imgdir); +} + +int main(int argc, char **argv) +{ + int pid, ret, p[2]; + + wdir_fd = open(argv[2], O_DIRECTORY); + if (wdir_fd < 0) { + perror("Can't open wdir"); + return 1; + } + + printf("--- Start loop ---\n"); + pipe(p); + pid = fork(); + if (pid < 0) { + perror("Can't"); + return -1; + } + + if (!pid) { + printf(" `- loop: initializing\n"); + if (setsid() < 0) + exit(1); + if (signal(SIGUSR1, sh) == SIG_ERR) + exit(1); + + close(0); + close(1); + close(2); + close(p[0]); + + ret = SUCC_ECODE; + write(p[1], &ret, sizeof(ret)); + close(p[1]); + + while (!stop) + sleep(1); + exit(SUCC_ECODE); + } + + close(p[1]); + + /* Wait for kid to start */ + ret = -1; + read(p[0], &ret, sizeof(ret)); + if (ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto err; + } + + /* Wait for pipe to get closed, then dump */ + read(p[0], &ret, 1); + close(p[0]); + + printf("--- Dump loop ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_pid(pid); + criu_set_log_file("dump.log"); + criu_set_log_level(CRIU_LOG_DEBUG); + criu_set_track_mem(true); + + open_imgdir(); + ret = criu_pre_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + goto err; + } + + printf(" `- Pre Dump 1 succeeded\n"); + + open_imgdir(); + ret = criu_pre_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + goto err; + } + + printf(" `- Pre Dump 2 succeeded\n"); + + open_imgdir(); + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + goto err; + } + + printf(" `- Final Dump succeeded\n"); + waitpid(pid, NULL, 0); + + printf("--- Restore ---\n"); + criu_init_opts(); + criu_set_log_level(CRIU_LOG_DEBUG); + criu_set_log_file("restore.log"); + criu_set_images_dir_fd(cur_imgdir); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + return -1; + } + + printf(" `- Restore returned pid %d\n", pid); + kill(pid, SIGUSR1); +err: + if (waitpid(pid, 
&ret, 0) < 0) { + perror(" Can't wait kid"); + return -1; + } + + return chk_exit(ret, SUCC_ECODE); +} diff --git a/test/others/libcriu/test_sub.c b/test/others/libcriu/test_sub.c index 697abf5d55..af1e09408c 100644 --- a/test/others/libcriu/test_sub.c +++ b/test/others/libcriu/test_sub.c @@ -15,8 +15,6 @@ static void sh(int sig) stop = 1; } -#define SUCC_ECODE 42 - int main(int argc, char **argv) { int pid, ret, fd, p[2]; diff --git a/test/zdtm.py b/test/zdtm.py index 0a52e1b96c..14e6aa1b07 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -202,6 +202,8 @@ def __copy_one(self, fname): def __copy_libs(self, binary): ldd = subprocess.Popen(["ldd", binary], stdout=subprocess.PIPE) + stdout, _ = ldd.communicate() + xl = re.compile( r'^(linux-gate.so|linux-vdso(64)?.so|not a dynamic|.*\s*ldd\s)') @@ -216,11 +218,9 @@ def __copy_libs(self, binary): map( lambda x: str(x).strip(), filter(lambda x: str(x).startswith('\t'), - ldd.stdout.read().decode( + stdout.decode( 'ascii').splitlines()))))) - ldd.wait() - for lib in libs: if not os.access(lib, os.F_OK): raise test_fail_exc("Can't find lib %s required by %s" % @@ -331,8 +331,7 @@ def decode_flav(i): def tail(path): p = subprocess.Popen(['tail', '-n1', path], stdout=subprocess.PIPE) - out = p.stdout.readline() - p.wait() + out, _ = p.communicate() return out.decode() @@ -590,11 +589,12 @@ def gone(self, force=True): os.unlink(self.__pidfile()) def print_output(self): - if os.access(self.__name + '.out', os.R_OK): - print("Test output: " + "=" * 32) - with open(self.__name + '.out') as output: - print(output.read()) - print(" <<< " + "=" * 32) + for postfix in ['.out', '.out.inprogress']: + if os.access(self.__name + postfix, os.R_OK): + print("Test output: " + "=" * 32) + with open(self.__name + postfix) as output: + print(output.read()) + print(" <<< " + "=" * 32) def static(self): return self.__name.split('/')[1] == 'static' @@ -608,9 +608,11 @@ def blocking(self): @staticmethod def available(): if not 
os.access("umount2", os.X_OK): - subprocess.check_call(["make", "umount2"]) + subprocess.check_call( + ["make", "umount2"], env=dict(os.environ, MAKEFLAGS="")) if not os.access("zdtm_ct", os.X_OK): - subprocess.check_call(["make", "zdtm_ct"]) + subprocess.check_call( + ["make", "zdtm_ct"], env=dict(os.environ, MAKEFLAGS="")) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) subprocess.check_call( @@ -801,7 +803,7 @@ def __init__(self, name, desc, flavor, freezer): if flavor.ns: self.__real_name = name with open(name) as fd: - self.__subs = map(lambda x: x.strip(), fd.readlines()) + self.__subs = list(map(lambda x: x.strip(), fd.readlines())) print("Subs:\n%s" % '\n'.join(self.__subs)) else: self.__real_name = '' @@ -819,8 +821,8 @@ def __get_start_cmd(self, name): subprocess.check_call(s_args + [tname + '.cleanout']) s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'], stdout=subprocess.PIPE) - cmd = s.stdout.readlines().pop().strip() - s.wait() + out, _ = s.communicate() + cmd = out.decode().splitlines()[-1].strip() return 'cd /' + tdir + ' && ' + cmd @@ -1039,6 +1041,7 @@ def __init__(self, opts): self.__lazy_pages_p = None self.__page_server_p = None self.__dump_process = None + self.__img_streamer_process = None self.__tls = self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] self.__crit_bin = opts['crit_bin'] @@ -1065,6 +1068,11 @@ def fini(self): self.__dump_process = None if ret: raise test_fail_exc("criu dump exited with %s" % ret) + if self.__img_streamer_process: + ret = self.wait_for_criu_image_streamer() + if ret: + raise test_fail_exc("criu-image-streamer exited with %s" % ret) + return def logs(self): @@ -1219,8 +1227,10 @@ def check_pages_counts(self): stent['pages_written']) if self.__stream: - p = self.spawn_criu_image_streamer("extract") - p.wait() + self.spawn_criu_image_streamer("extract") + ret = self.wait_for_criu_image_streamer() + if ret: + raise 
test_fail_exc("criu-image-streamer (extract) exited with %s" % ret) real_written = 0 for f in os.listdir(self.__ddir()): @@ -1262,6 +1272,8 @@ def spawn_criu_image_streamer(self, action): "--progress-fd {progress_fd}", action] + log = open(os.path.join(self.__ddir(), "img-streamer.log"), "w") + # * As we are using a shell pipe command, we want to use pipefail. # Otherwise, failures stay unnoticed. For this, we use bash as sh # doesn't support that feature. @@ -1270,7 +1282,9 @@ def spawn_criu_image_streamer(self, action): progress_fd=progress_w, images_dir=self.__ddir(), img_file=os.path.join(self.__ddir(), STREAMED_IMG_FILE_NAME) - )], close_fds=False) + )], stderr=log, close_fds=False) + + log.close() os.close(progress_w) progress = os.fdopen(progress_r, "r") @@ -1287,7 +1301,15 @@ def spawn_criu_image_streamer(self, action): raise test_fail_exc( "criu-image-streamer is not starting (exit_code=%d)" % p.wait()) - return p + progress.close() + + self.__img_streamer_process = p + + def wait_for_criu_image_streamer(self): + ret = self.__img_streamer_process.wait() + grep_errors(os.path.join(self.__ddir(), "img-streamer.log")) + self.__img_streamer_process = None + return ret def dump(self, action, opts=[]): self.__iter += 1 @@ -1319,7 +1341,7 @@ def dump(self, action, opts=[]): a_opts += self.__test.getdopts() if self.__stream: - streamer_p = self.spawn_criu_image_streamer("capture") + self.spawn_criu_image_streamer("capture") a_opts += ["--stream"] if self.__dedup: @@ -1347,9 +1369,9 @@ def dump(self, action, opts=[]): opts=a_opts + opts, nowait=nowait) if self.__stream: - ret = streamer_p.wait() + ret = self.wait_for_criu_image_streamer() if ret: - raise test_fail_exc("criu-image-streamer exited with %d" % ret) + raise test_fail_exc("criu-image-streamer (capture) exited with %d" % ret) if self.__mdedup and self.__iter > 1: self.__criu_act("dedup", opts=[]) @@ -1382,7 +1404,7 @@ def restore(self): r_opts += ['--action-script', os.getcwd() + '/empty-netns-prep.sh'] 
if self.__stream: - streamer_p = self.spawn_criu_image_streamer("serve") + self.spawn_criu_image_streamer("serve") r_opts += ["--stream"] if self.__dedup: @@ -1419,9 +1441,9 @@ def restore(self): self.__criu_act("restore", opts=r_opts + ["--restore-detached"]) if self.__stream: - ret = streamer_p.wait() + ret = self.wait_for_criu_image_streamer() if ret: - raise test_fail_exc("criu-image-streamer exited with %d" % ret) + raise test_fail_exc("criu-image-streamer (serve) exited with %d" % ret) self.show_stats("restore") @@ -1453,19 +1475,23 @@ def kill(self): self.__lazy_pages_p.terminate() print("criu lazy-pages exited with %s" % self.__lazy_pages_p.wait()) - grep_errors(os.path.join(self.__ddir(), "lazy-pages.log")) + grep_errors(os.path.join(self.__ddir(), "lazy-pages.log"), err=True) self.__lazy_pages_p = None if self.__page_server_p: self.__page_server_p.terminate() print("criu page-server exited with %s" % self.__page_server_p.wait()) - grep_errors(os.path.join(self.__ddir(), "page-server.log")) + grep_errors(os.path.join(self.__ddir(), "page-server.log"), err=True) self.__page_server_p = None if self.__dump_process: self.__dump_process.terminate() - print("criu dump exited with %s" % self.__dump_process.wait()) + print("criu dump exited with %s" % self.__dump_process.wait(), err=True) grep_errors(os.path.join(self.__ddir(), "dump.log")) self.__dump_process = None + if self.__img_streamer_process: + self.__img_streamer_process.terminate() + ret = self.wait_for_criu_image_streamer() + print("criu-image-streamer exited with %s" % ret) def try_run_hook(test, args): @@ -1973,7 +1999,22 @@ def run_test(self, name, desc, flavor): raise Exception("The kernel is tainted: %r (%r)" % (taint, self.__taint)) - if test_flag(desc, 'excl'): + ''' + The option --link-remap allows criu to hardlink open files back to the + file-system on dump (should be removed on restore) and we have a sanity + check in check_visible_state that they were actually removed at least + from the 
root test directory after restore. + + As zdtm runs all tests from the same cwd (e.g.: test/zdtm/static) in + parallel, hardlinks from one test can mess up with sanity checks of + another test or even one test can by mistake use hardlinks created by + another test which is even worse. + + So let's make all tests using --link-remap option non parallel. + ''' + link_remap_excl = '--link-remap' in desc.get('opts', '').split() + desc.get('dopts', '').split() + desc.get('ropts', '').split() + + if test_flag(desc, 'excl') or link_remap_excl: self.wait_all() self.__nr += 1 @@ -2006,7 +2047,10 @@ def run_test(self, name, desc, flavor): "start": time.time() } - if test_flag(desc, 'excl'): + if log: + log.close() + + if test_flag(desc, 'excl') or link_remap_excl: self.wait() def __wait_one(self, flags): @@ -2029,6 +2073,9 @@ def __wait_one(self, flags): self.__runtest += 1 if pid != 0: sub = self.__subs.pop(pid) + # The following wait() is not useful for our domain logic. + # It's useful for taming warnings in subprocess.Popen.__del__() + sub['sub'].wait() tc = None if self.__junit_test_cases is not None: tc = TestCase(sub['name'], @@ -2129,9 +2176,9 @@ def all_tests(opts): continue files.append(fp) excl = list(map(lambda x: os.path.join(desc['dir'], x), desc['exclude'])) - tlist = filter( + tlist = list(filter( lambda x: not x.endswith('.checkskip') and not x.endswith('.hook') and - x not in excl, map(lambda x: x.strip(), files)) + x not in excl, map(lambda x: x.strip(), files))) return tlist diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 69154fdc96..d345233154 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -25,7 +25,7 @@ ifeq ($(ARCH),arm) ifeq ($(ARMV),6) USERCFLAGS += -march=armv6 else ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a + USERCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) # To build aarch32 on armv8 Travis-CI (see criu Makefile) USERCFLAGS += -march=armv7-a diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c 
index 81da81ebab..57eb42046a 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -20,9 +20,11 @@ #include "ns.h" futex_t sig_received; +/* clang-format off */ static struct { futex_t stage; -} * test_shared_state; +} *test_shared_state; +/* clang-format on */ enum { TEST_INIT_STAGE = 0, diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index c9e6589f07..4a21978b59 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -68,9 +68,13 @@ TST_NOFILE := \ utsname \ pstree \ sockets01 \ + sockets01-seqpacket \ sockets02 \ + sockets02-seqpacket \ sockets_spair \ + sockets_spair_seqpacket \ socket_queues \ + socket_queues_seqpacket \ socket-raw \ socket-tcp \ socket-tcp-listen \ @@ -107,6 +111,7 @@ TST_NOFILE := \ socket-tcp4v6-closed \ socket-tcp-close0 \ socket-tcp-close1 \ + socket-tcp-close2 \ socket-dump-tcp-close \ socket-tcp-unconn \ socket-tcp6-unconn \ @@ -116,7 +121,9 @@ TST_NOFILE := \ socket-linger \ sock_opts00 \ sock_opts01 \ + sock_opts02 \ sk-unix-unconn \ + sk-unix-unconn-seqpacket \ ipc_namespace \ selfexe00 \ sem \ @@ -185,6 +192,7 @@ TST_NOFILE := \ scm01 \ scm02 \ scm03 \ + scm03-seqpacket \ scm04 \ scm05 \ scm06 \ @@ -282,6 +290,7 @@ TST_FILE = \ file_attr \ deleted_unix_sock \ sk-unix-rel \ + sk-unix-rel-seqpacket \ deleted_dev \ unlink_fstat00 \ unlink_fstat01 \ @@ -309,7 +318,9 @@ TST_FILE = \ cow01 \ fdt_shared \ sockets00 \ + sockets00-seqpacket \ sockets03 \ + sockets03-seqpacket \ sockets_dgram \ file_lease00 \ file_lease01 \ @@ -392,8 +403,10 @@ TST_DIR = \ mnt_enablefs \ autofs \ del_standalone_un \ + del_standalone_un_seqpacket \ sk-unix-mntns \ sk-unix01 \ + sk-unix01-seqpacket \ sk-unix-dgram-ghost \ unsupported_children_collision \ shared_slave_mount_children \ @@ -605,6 +618,18 @@ socket-tcp6-unconn: CFLAGS += -D ZDTM_IPV6 socket-tcp4v6-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK -D ZDTM_IPV4V6 socket-tcp4v6-closing: CFLAGS += -D ZDTM_IPV4V6 +sockets00-seqpacket: CFLAGS += -D 
ZDTM_UNIX_SEQPACKET +sockets01-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +sockets02-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +sockets03-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +sk-unix01-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +sk-unix-rel-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +sockets_spair_seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +socket_queues_seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +del_standalone_un_seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +sk-unix-unconn-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET +scm03-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET + pty-console: CFLAGS += -D ZDTM_DEV_CONSOLE shm-unaligned: CFLAGS += -DZDTM_SHM_UNALIGNED diff --git a/test/zdtm/static/aio01.c b/test/zdtm/static/aio01.c index ed45192b97..100069b03d 100644 --- a/test/zdtm/static/aio01.c +++ b/test/zdtm/static/aio01.c @@ -14,8 +14,8 @@ const char *test_doc = "Check head and tail restore correct"; const char *test_author = "Kirill Tkhai "; struct aio_ring { - unsigned id; /* kernel internal index number */ - unsigned nr; /* number of io_events */ + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). 
*/ unsigned tail; diff --git a/test/zdtm/static/auto_dev-ioctl.h b/test/zdtm/static/auto_dev-ioctl.h index e65259b307..1b35fe2f7d 100644 --- a/test/zdtm/static/auto_dev-ioctl.h +++ b/test/zdtm/static/auto_dev-ioctl.h @@ -95,7 +95,7 @@ struct args_ismountpoint { struct autofs_dev_ioctl { __u32 ver_major; __u32 ver_minor; - __u32 size; /* total size of data passed in + __u32 size; /* total size of data passed in * including this struct */ __s32 ioctlfd; /* automount command fd */ diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c index 2d6078627f..ad17958427 100644 --- a/test/zdtm/static/autofs.c +++ b/test/zdtm/static/autofs.c @@ -47,6 +47,7 @@ static char *xvstrcat(char *str, const char *fmt, va_list args) ret = -ENOMEM; new = realloc(str, offset + delta); if (new) { + str = new; va_copy(tmp, args); ret = vsnprintf(new + offset, delta, fmt, tmp); va_end(tmp); @@ -54,7 +55,6 @@ static char *xvstrcat(char *str, const char *fmt, va_list args) /* NOTE: vsnprintf returns the amount of bytes * * to allocate. 
*/ delta = ret + 1; - str = new; ret = 0; } } @@ -266,6 +266,7 @@ static int check_automount(struct autofs_params *p) return err; free(mountpoint); + mountpoint = NULL; err = p->setup(p); if (err) { @@ -274,7 +275,7 @@ static int check_automount(struct autofs_params *p) } if (close(p->fd)) { - pr_perror("%s: failed to close fd %d", mountpoint, p->fd); + pr_perror("mountpoint failed to close fd %d", p->fd); return -errno; } diff --git a/test/zdtm/static/child_subreaper_and_reparent.c b/test/zdtm/static/child_subreaper_and_reparent.c index ba03517bae..c71778ae9f 100644 --- a/test/zdtm/static/child_subreaper_and_reparent.c +++ b/test/zdtm/static/child_subreaper_and_reparent.c @@ -19,11 +19,13 @@ enum { TEST_EXIT, }; +/* clang-format off */ struct shared { futex_t fstate; int parent_before_cr; int parent_after_cr; -} * sh; +} *sh; +/* clang-format on */ int orphan(void) { diff --git a/test/zdtm/static/child_subreaper_existing_child.c b/test/zdtm/static/child_subreaper_existing_child.c index 4805aa41d6..92d22bc4a5 100644 --- a/test/zdtm/static/child_subreaper_existing_child.c +++ b/test/zdtm/static/child_subreaper_existing_child.c @@ -18,10 +18,12 @@ enum { TEST_EXIT, }; +/* clang-format off */ struct shared { futex_t fstate; int ppid_after_reparent; -} * sh; +} *sh; +/* clang-format on */ int orphan(void) { diff --git a/test/zdtm/static/del_standalone_un.c b/test/zdtm/static/del_standalone_un.c index c9fa84870b..b4f99e2606 100644 --- a/test/zdtm/static/del_standalone_un.c +++ b/test/zdtm/static/del_standalone_un.c @@ -16,11 +16,17 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + static int bind_and_listen(struct sockaddr_un *addr) { int sk; - sk = socket(PF_UNIX, SOCK_STREAM, 0); + sk = socket(PF_UNIX, SOCK_TYPE, 0); if (sk < 0) { fail("socket"); return -1; diff --git 
a/test/zdtm/static/del_standalone_un_seqpacket.c b/test/zdtm/static/del_standalone_un_seqpacket.c new file mode 120000 index 0000000000..d88fcbad8d --- /dev/null +++ b/test/zdtm/static/del_standalone_un_seqpacket.c @@ -0,0 +1 @@ +del_standalone_un.c \ No newline at end of file diff --git a/test/zdtm/static/file_fown.c b/test/zdtm/static/file_fown.c index eb42a826eb..2c5ba82c2b 100644 --- a/test/zdtm/static/file_fown.c +++ b/test/zdtm/static/file_fown.c @@ -22,12 +22,14 @@ const char *test_doc = "Check for signal delivery on file owners"; const char *test_author = "Cyrill Gorcunov "; +/* clang-format off */ struct params { int sigio; int pipe_flags[2]; int pipe_pid[2]; int pipe_sig[2]; -} * shared; +} *shared; +/* clang-format on */ static void signal_handler_io(int status) { diff --git a/test/zdtm/static/file_locks00.c b/test/zdtm/static/file_locks00.c index 0b5d1313b2..01782fa7a4 100644 --- a/test/zdtm/static/file_locks00.c +++ b/test/zdtm/static/file_locks00.c @@ -23,10 +23,10 @@ static int lock_reg(int fd, int cmd, int type, int whence, off_t offset, off_t l { struct flock lock; - lock.l_type = type; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_type = type; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ - lock.l_start = offset; /* byte offset, relative to l_whence */ - lock.l_len = len; /* #bytes (0 means to EOF) */ + lock.l_start = offset; /* byte offset, relative to l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ errno = 0; return fcntl(fd, cmd, &lock); @@ -40,10 +40,10 @@ static int check_read_lock(int fd, int whence, off_t offset, off_t len) struct flock lock; int ret; - lock.l_type = F_RDLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_type = F_RDLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ - lock.l_start = offset; /* byte offset, relative to l_whence */ - lock.l_len = len; /* #bytes (0 means to EOF) */ + lock.l_start = offset; /* byte offset, relative to 
l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ lock.l_pid = -1; errno = 0; @@ -69,10 +69,10 @@ static int check_write_lock(int fd, int whence, off_t offset, off_t len) int ret; pid_t ppid = getppid(); - lock.l_type = F_WRLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_type = F_WRLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ - lock.l_start = offset; /* byte offset, relative to l_whence */ - lock.l_len = len; /* #bytes (0 means to EOF) */ + lock.l_start = offset; /* byte offset, relative to l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ lock.l_pid = -1; errno = 0; diff --git a/test/zdtm/static/ipc_namespace.c b/test/zdtm/static/ipc_namespace.c index 98241d8163..b13b357bac 100644 --- a/test/zdtm/static/ipc_namespace.c +++ b/test/zdtm/static/ipc_namespace.c @@ -19,27 +19,28 @@ extern int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf); struct ipc_ids { int in_use; /* TODO: Check for 0 */ - // unsigned short seq; - // unsigned short seq_max; - // struct rw_semaphore rw_mutex; - // struct idr ipcs_idr; /* TODO */ + + // unsigned short seq; + // unsigned short seq_max; + // struct rw_semaphore rw_mutex; + // struct idr ipcs_idr; /* TODO */ }; struct ipc_ns { struct ipc_ids ids[3]; - int sem_ctls[4]; // + - int used_sems; // + + int sem_ctls[4]; + int used_sems; - int msg_ctlmax; // + - int msg_ctlmnb; // + - int msg_ctlmni; // + - int msg_bytes; // + - int msg_hdrs; // + - int auto_msgmni; // + - int msg_next_id; // + - int sem_next_id; // + - int shm_next_id; // + + int msg_ctlmax; + int msg_ctlmnb; + int msg_ctlmni; + int msg_bytes; + int msg_hdrs; + int auto_msgmni; + int msg_next_id; + int sem_next_id; + int shm_next_id; size_t shm_ctlmax; size_t shm_ctlall; @@ -51,10 +52,10 @@ struct ipc_ns { // unsigned int mq_queues_count; - unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ - unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ - unsigned int mq_msgsize_max; /* 
initialized to DFLT_MSGSIZEMAX */ - unsigned int mq_msg_default; /* initialized to DFLT_MSG */ + unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ + unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ + unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ + unsigned int mq_msg_default; /* initialized to DFLT_MSG */ unsigned int mq_msgsize_default; /* initialized to DFLT_MSGSIZE */ struct user_ns *user_ns; diff --git a/test/zdtm/static/maps00.c b/test/zdtm/static/maps00.c index 10a4cac790..b1e55e8614 100644 --- a/test/zdtm/static/maps00.c +++ b/test/zdtm/static/maps00.c @@ -158,7 +158,13 @@ static int check_map(struct map *map) if (!sigsetjmp(segv_ret, 1)) { if (map->prot & PROT_WRITE) { - memcpy(map->ptr, test_func, getpagesize()); + memcpy(map->ptr, test_func, ONE_MAP_SIZE); + /* The ARM ARM architecture does not require the + * hardware to ensure coherency between instruction + * caches and memory, flushing dcache and icache is + * necessary to prevent SIGILL signal. 
+ */ + __builtin___clear_cache(map->ptr, map->ptr + ONE_MAP_SIZE); } else { if (!(map->flag & MAP_ANONYMOUS)) { uint8_t funlen = (uint8_t *)check_map - (uint8_t *)test_func; diff --git a/test/zdtm/static/mntns-deleted-dst b/test/zdtm/static/mntns-deleted-dst deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/zdtm/static/mntns_ghost01.c b/test/zdtm/static/mntns_ghost01.c index 20397d543a..2cc2270dd8 100644 --- a/test/zdtm/static/mntns_ghost01.c +++ b/test/zdtm/static/mntns_ghost01.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "zdtmtst.h" @@ -89,6 +90,13 @@ int main(int argc, char **argv) return 1; } + fd = open(ghost_path, O_CREAT | O_WRONLY, 0600); + if (fd >= 0 || errno != EROFS) { + pr_perror("open for write on rofs -> %d", fd); + close(fd); + return 1; + } + return 0; } diff --git a/test/zdtm/static/mprotect00.c b/test/zdtm/static/mprotect00.c index 006b647729..717b7ddcf2 100644 --- a/test/zdtm/static/mprotect00.c +++ b/test/zdtm/static/mprotect00.c @@ -44,10 +44,12 @@ static int check_prot(char *ptr, int prot) fail("PROT_READ bypassed"); return -1; } - } else /* we come here on return from SIGSEGV handler */ + } else { + /* we come here on return from SIGSEGV handler */ if (prot & PROT_READ) { - fail("PROT_READ rejected"); - return -1; + fail("PROT_READ rejected"); + return -1; + } } if (!sigsetjmp(segv_ret, 1)) { @@ -56,10 +58,12 @@ static int check_prot(char *ptr, int prot) fail("PROT_WRITE bypassed"); return -1; } - } else /* we come here on return from SIGSEGV handler */ + } else { + /* we come here on return from SIGSEGV handler */ if (prot & PROT_WRITE) { - fail("PROT_WRITE rejected"); - return -1; + fail("PROT_WRITE rejected"); + return -1; + } } if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index e220daa7f4..1e6ee1dea5 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -55,36 +55,36 @@ struct range { }; struct range 
rand_range4[] = { - { 0, 1 }, /* accept_local */ - { -1, 0 }, /* accept_source_route */ - { 0, 1 }, /* arp_accept */ - { 0, 2 }, /* arp_announce */ - { 0, 1 }, /* arp_filter */ - { 0, 8 }, /* arp_ignore */ - { 0, 1 }, /* arp_notify */ - { 0, 1 }, /* bootp_relay */ - { 0, 1 }, /* disable_policy */ - { 0, 1 }, /* disable_xfrm */ - { 0, 1 }, /* drop_gratuitous_arp */ - { 0, 1 }, /* drop_unicast_in_l2_multicast */ - { 0, INT_MAX }, /* force_igmp_version */ - { 0, 1 }, /* forwarding */ - { 0, 1 }, /* accept_redirects */ - { 0, INT_MAX }, /* igmpv2_unsolicited_report_interval */ - { 0, INT_MAX }, /* igmpv3_unsolicited_report_interval */ - { 0, 1 }, /* ignore_routes_with_linkdown */ - { 0, 1 }, /* log_martians */ - { 0, 1 }, /* mc_forwarding */ - { -1, INT_MAX }, /* medium_id */ - { 0, 1 }, /* promote_secondaries */ - { 0, 1 }, /* proxy_arp */ - { 0, 1 }, /* proxy_arp_pvlan */ - { 0, 1 }, /* route_localnet */ - { 0, 2 }, /* rp_filter */ - { 0, 1 }, /* secure_redirects */ - { 0, 1 }, /* send_redirects */ - { 0, 1 }, /* shared_media */ - { 0, 1 }, /* src_valid_mark */ + { 0, 1 }, /* accept_local */ + { -1, 0 }, /* accept_source_route */ + { 0, 1 }, /* arp_accept */ + { 0, 2 }, /* arp_announce */ + { 0, 1 }, /* arp_filter */ + { 0, 8 }, /* arp_ignore */ + { 0, 1 }, /* arp_notify */ + { 0, 1 }, /* bootp_relay */ + { 0, 1 }, /* disable_policy */ + { 0, 1 }, /* disable_xfrm */ + { 0, 1 }, /* drop_gratuitous_arp */ + { 0, 1 }, /* drop_unicast_in_l2_multicast */ + { 0, INT_MAX }, /* force_igmp_version */ + { 0, 1 }, /* forwarding */ + { 0, 1 }, /* accept_redirects */ + { 0, INT_MAX }, /* igmpv2_unsolicited_report_interval */ + { 0, INT_MAX }, /* igmpv3_unsolicited_report_interval */ + { 0, 1 }, /* ignore_routes_with_linkdown */ + { 0, 1 }, /* log_martians */ + { 0, 1 }, /* mc_forwarding */ + { -1, INT_MAX }, /* medium_id */ + { 0, 1 }, /* promote_secondaries */ + { 0, 1 }, /* proxy_arp */ + { 0, 1 }, /* proxy_arp_pvlan */ + { 0, 1 }, /* route_localnet */ + { 0, 2 }, /* rp_filter 
*/ + { 0, 1 }, /* secure_redirects */ + { 0, 1 }, /* send_redirects */ + { 0, 1 }, /* shared_media */ + { 0, 1 }, /* src_valid_mark */ { INT_MIN, INT_MAX }, /* tag */ }; @@ -139,47 +139,47 @@ char *devconfs6[] = { #define MAX_ADDRESSES 128 struct range rand_range6[] = { - { 0, 2 }, /* accept_dad */ - { 0, 2 }, /* accept_ra */ - { 0, 1 }, /* accept_ra_defrtr */ - { 0, 1 }, /* accept_ra_from_local */ - { 0, INT_MAX }, /* accept_ra_min_hop_limit */ - { 0, 1 }, /* accept_ra_mtu */ - { 0, 1 }, /* accept_ra_pinfo */ - { 0, INT_MAX }, /* accept_ra_rt_info_max_plen */ - { 0, 1 }, /* accept_ra_rtr_pref */ - { -1, 0 }, /* accept_source_route */ - { 0, 1 }, /* autoconf */ - { 0, INT_MAX }, /* dad_transmits */ - { 0, 1 }, /* disable_ipv6 */ - { 0, 1 }, /* drop_unicast_in_l2_multicast */ - { 0, 1 }, /* drop_unsolicited_na */ - { 0, 2 }, /* force_mld_version */ - { 0, 1 }, /* force_tllao */ - { 0, 1 }, /* forwarding */ - { 0, 1 }, /* accept_redirects */ - { 1, 255 }, /* hop_limit */ - { 0, 1 }, /* ignore_routes_with_linkdown */ - { -1, 1 }, /* keep_addr_on_down */ - { 0, MAX_ADDRESSES }, /* max_addresses */ - { 0, INT_MAX }, /* max_desync_factor */ - { 0, INT_MAX }, /* mldv1_unsolicited_report_interval */ - { 0, INT_MAX }, /* mldv2_unsolicited_report_interval */ + { 0, 2 }, /* accept_dad */ + { 0, 2 }, /* accept_ra */ + { 0, 1 }, /* accept_ra_defrtr */ + { 0, 1 }, /* accept_ra_from_local */ + { 0, INT_MAX }, /* accept_ra_min_hop_limit */ + { 0, 1 }, /* accept_ra_mtu */ + { 0, 1 }, /* accept_ra_pinfo */ + { 0, INT_MAX }, /* accept_ra_rt_info_max_plen */ + { 0, 1 }, /* accept_ra_rtr_pref */ + { -1, 0 }, /* accept_source_route */ + { 0, 1 }, /* autoconf */ + { 0, INT_MAX }, /* dad_transmits */ + { 0, 1 }, /* disable_ipv6 */ + { 0, 1 }, /* drop_unicast_in_l2_multicast */ + { 0, 1 }, /* drop_unsolicited_na */ + { 0, 2 }, /* force_mld_version */ + { 0, 1 }, /* force_tllao */ + { 0, 1 }, /* forwarding */ + { 0, 1 }, /* accept_redirects */ + { 1, 255 }, /* hop_limit */ + { 0, 1 }, /* 
ignore_routes_with_linkdown */ + { -1, 1 }, /* keep_addr_on_down */ + { 0, MAX_ADDRESSES }, /* max_addresses */ + { 0, INT_MAX }, /* max_desync_factor */ + { 0, INT_MAX }, /* mldv1_unsolicited_report_interval */ + { 0, INT_MAX }, /* mldv2_unsolicited_report_interval */ { IPV6_MIN_MTU, IPV6_MIN_MTU }, /* mtu */ - { 0, 1 }, /* ndisc_notify */ - { 0, 1 }, /* optimistic_dad */ - { 0, 1 }, /* proxy_ndp */ - { 0, INT_MAX }, /* regen_max_retry */ - { 0, ROUTER_MAX }, /* router_probe_interval */ - { 0, ROUTER_MAX }, /* router_solicitation_delay */ - { 0, ROUTER_MAX }, /* router_solicitation_interval */ - { 0, ROUTER_MAX }, /* router_solicitations */ - { 0, 1 }, /* suppress_frag_ndisc */ - { 0, INT_MAX }, /* temp_prefered_lft */ - { 0, INT_MAX }, /* temp_valid_lft */ - { 0, 1 }, /* use_oif_addrs_only */ - { 0, 1 }, /* use_optimistic */ - { 0, 2 }, /* use_tempaddr */ + { 0, 1 }, /* ndisc_notify */ + { 0, 1 }, /* optimistic_dad */ + { 0, 1 }, /* proxy_ndp */ + { 0, INT_MAX }, /* regen_max_retry */ + { 0, ROUTER_MAX }, /* router_probe_interval */ + { 0, ROUTER_MAX }, /* router_solicitation_delay */ + { 0, ROUTER_MAX }, /* router_solicitation_interval */ + { 0, ROUTER_MAX }, /* router_solicitations */ + { 0, 1 }, /* suppress_frag_ndisc */ + { 0, INT_MAX }, /* temp_prefered_lft */ + { 0, INT_MAX }, /* temp_valid_lft */ + { 0, 1 }, /* use_oif_addrs_only */ + { 0, 1 }, /* use_optimistic */ + { 0, 2 }, /* use_tempaddr */ }; struct test_conf { diff --git a/test/zdtm/static/s390x_regs_check.c b/test/zdtm/static/s390x_regs_check.c index 8d6b479974..40c480b3f2 100644 --- a/test/zdtm/static/s390x_regs_check.c +++ b/test/zdtm/static/s390x_regs_check.c @@ -59,11 +59,11 @@ static int pipefd[2]; */ struct reg_set { const char *name; /* Name of regset */ - int nr; /* Number of regset */ - void *data; /* Test data */ - int len; /* Number of bytes of test data */ - bool optional; /* Not all kernels/machines have this reg set */ - bool available; /* Current kernel/machine has this reg set */ + 
int nr; /* Number of regset */ + void *data; /* Test data */ + int len; /* Number of bytes of test data */ + bool optional; /* Not all kernels/machines have this reg set */ + bool available; /* Current kernel/machine has this reg set */ }; /* @@ -397,8 +397,8 @@ static inline void send_tid_and_loop(int fd) asm volatile("lgr 2,%0\n" /* Arg 1: fd */ "la 3,%1\n" /* Arg 2: &tid */ - "lghi 4,4\n" /* Arg 3: sizeof(int) */ - "svc 4\n" /* __NR_write SVC: */ + "lghi 4,4\n" /* Arg 3: sizeof(int) */ + "svc 4\n" /* __NR_write SVC: */ /* After SVC no more registers are changed */ "0: j 0b\n" /* Loop here */ : diff --git a/test/zdtm/static/scm03-seqpacket.c b/test/zdtm/static/scm03-seqpacket.c new file mode 120000 index 0000000000..f1f86dd8be --- /dev/null +++ b/test/zdtm/static/scm03-seqpacket.c @@ -0,0 +1 @@ +scm03.c \ No newline at end of file diff --git a/test/zdtm/static/scm03.c b/test/zdtm/static/scm03.c index a40fc01015..4453f7e936 100644 --- a/test/zdtm/static/scm03.c +++ b/test/zdtm/static/scm03.c @@ -9,6 +9,12 @@ const char *test_doc = "Check that SCM_RIGHTS are preserved"; const char *test_author = "Pavel Emelyanov "; +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_DGRAM +#endif + static int send_fd(int via, int fd1, int fd2) { struct msghdr h = {}; @@ -105,7 +111,7 @@ int main(int argc, char **argv) test_init(argc, argv); - if (socketpair(PF_UNIX, SOCK_DGRAM, 0, sk) < 0) { + if (socketpair(PF_UNIX, SOCK_TYPE, 0, sk) < 0) { pr_perror("Can't make unix pair"); exit(1); } diff --git a/test/zdtm/static/session01.c b/test/zdtm/static/session01.c index 0f727a9a67..31a617de93 100644 --- a/test/zdtm/static/session01.c +++ b/test/zdtm/static/session01.c @@ -40,22 +40,22 @@ enum { static struct testcase *testcases; static futex_t *fstate; static struct testcase __testcases[] = { - { 2, 1, 2, 1, 2, 1 }, /* session00 */ - { 4, 2, 4, 2, 4, 1 }, /* |\_session00 */ - { 15, 4, 4, 4, 15, 1 }, /* | |\_session00 */ - { 16, 4, 4, 4, 15, 1 }, 
/* | \_session00 */ - { 17, 4, 4, 4, 17, 0 }, /* | |\_session00 */ - { 18, 4, 4, 4, 17, 1 }, /* | \_session00 */ - { 5, 2, 2, 2, 2, 1 }, /* |\_session00 */ - { 8, 2, 8, 2, 8, 1 }, /* |\_session00 */ - { 9, 8, 2, 2, 2, 1 }, /* | \_session00 */ - { 10, 2, 10, 2, 10, 1 }, /* |\_session00 */ + { 2, 1, 2, 1, 2, 1 }, /* session00 */ + { 4, 2, 4, 2, 4, 1 }, /* |\_session00 */ + { 15, 4, 4, 4, 15, 1 }, /* | |\_session00 */ + { 16, 4, 4, 4, 15, 1 }, /* | \_session00 */ + { 17, 4, 4, 4, 17, 0 }, /* | |\_session00 */ + { 18, 4, 4, 4, 17, 1 }, /* | \_session00 */ + { 5, 2, 2, 2, 2, 1 }, /* |\_session00 */ + { 8, 2, 8, 2, 8, 1 }, /* |\_session00 */ + { 9, 8, 2, 2, 2, 1 }, /* | \_session00 */ + { 10, 2, 10, 2, 10, 1 }, /* |\_session00 */ { 11, 10, 11, 2, 11, 1 }, /* | \_session00 */ - { 12, 11, 2, 2, 2, 1 }, /* | \_session00 */ - { 13, 2, 2, 2, 2, 0 }, /* \_session00 */ - { 3, 13, 2, 2, 2, 1 }, /* session00 */ - { 6, 2, 6, 2, 6, 0 }, /* \_session00 */ - { 14, 6, 6, 6, 6, 1 }, /* session00 */ + { 12, 11, 2, 2, 2, 1 }, /* | \_session00 */ + { 13, 2, 2, 2, 2, 0 }, /* \_session00 */ + { 3, 13, 2, 2, 2, 1 }, /* session00 */ + { 6, 2, 6, 2, 6, 0 }, /* \_session00 */ + { 14, 6, 6, 6, 6, 1 }, /* session00 */ }; #define TESTS (sizeof(__testcases) / sizeof(struct testcase)) diff --git a/test/zdtm/static/shm-mp.c b/test/zdtm/static/shm-mp.c index 1929dac191..c95f3d84cc 100644 --- a/test/zdtm/static/shm-mp.c +++ b/test/zdtm/static/shm-mp.c @@ -33,10 +33,12 @@ static int check_prot(char *ptr, char val, int prot) fail("PROT_READ bypassed"); return -1; } - } else /* we come here on return from SIGSEGV handler */ + } else { + /* we come here on return from SIGSEGV handler */ if (prot & PROT_READ) { - fail("PROT_READ rejected"); - return -1; + fail("PROT_READ rejected"); + return -1; + } } if (!sigsetjmp(segv_ret, 1)) { @@ -45,10 +47,12 @@ static int check_prot(char *ptr, char val, int prot) fail("PROT_WRITE bypassed"); return -1; } - } else /* we come here on return from SIGSEGV handler */ + } 
else { + /* we come here on return from SIGSEGV handler */ if (prot & PROT_WRITE) { - fail("PROT_WRITE rejected"); - return -1; + fail("PROT_WRITE rejected"); + return -1; + } } if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { diff --git a/test/zdtm/static/sigpending.c b/test/zdtm/static/sigpending.c index 1641fdd86e..ce03ff55c9 100644 --- a/test/zdtm/static/sigpending.c +++ b/test/zdtm/static/sigpending.c @@ -18,7 +18,7 @@ static int numsig; #define TESTSIG (SIGRTMAX) #define THREADSIG (SIGRTMIN) static siginfo_t share_infos[2]; -static siginfo_t self_infos[64]; /* self */ +static siginfo_t self_infos[64]; /* self */ static siginfo_t thread_infos[3]; /* thread */ static int share_nr; static int self_nr; diff --git a/test/zdtm/static/sk-unix-rel-seqpacket.c b/test/zdtm/static/sk-unix-rel-seqpacket.c new file mode 120000 index 0000000000..1f98e3845d --- /dev/null +++ b/test/zdtm/static/sk-unix-rel-seqpacket.c @@ -0,0 +1 @@ +sk-unix-rel.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-rel.c b/test/zdtm/static/sk-unix-rel.c index 10c19080ae..7e4aeafe6c 100644 --- a/test/zdtm/static/sk-unix-rel.c +++ b/test/zdtm/static/sk-unix-rel.c @@ -25,6 +25,12 @@ TEST_OPTION(filename, string, "socket file name", 1); #define TEST_MODE 0640 +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + int main(int argc, char *argv[]) { struct sockaddr_un addr; @@ -54,8 +60,8 @@ int main(int argc, char *argv[]) memcpy(addr.sun_path, filename, addrlen); addrlen += sizeof(addr.sun_family); - sock[0] = socket(AF_UNIX, SOCK_STREAM, 0); - sock[1] = socket(AF_UNIX, SOCK_STREAM, 0); + sock[0] = socket(AF_UNIX, SOCK_TYPE, 0); + sock[1] = socket(AF_UNIX, SOCK_TYPE, 0); if (sock[0] < 0 || sock[1] < 0) { fail("socket"); exit(1); diff --git a/test/zdtm/static/sk-unix-unconn-seqpacket.c b/test/zdtm/static/sk-unix-unconn-seqpacket.c new file mode 120000 index 0000000000..f5c276186a --- /dev/null +++ 
b/test/zdtm/static/sk-unix-unconn-seqpacket.c @@ -0,0 +1 @@ +sk-unix-unconn.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-unconn.c b/test/zdtm/static/sk-unix-unconn.c index caad3d3157..62e48247f6 100644 --- a/test/zdtm/static/sk-unix-unconn.c +++ b/test/zdtm/static/sk-unix-unconn.c @@ -9,6 +9,12 @@ const char *test_doc = "Check unconnected unix sockets"; const char *test_author = "Vagin Andrew "; +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + int main(int argc, char **argv) { int sk, skc; @@ -19,13 +25,13 @@ int main(int argc, char **argv) test_init(argc, argv); - sk = socket(AF_UNIX, SOCK_STREAM, 0); + sk = socket(AF_UNIX, SOCK_TYPE, 0); if (sk == -1) { pr_perror("socket"); return 1; } - skc = socket(AF_UNIX, SOCK_STREAM, 0); + skc = socket(AF_UNIX, SOCK_TYPE, 0); if (skc == -1) { pr_perror("socket"); return 1; diff --git a/test/zdtm/static/sk-unix01-seqpacket.c b/test/zdtm/static/sk-unix01-seqpacket.c new file mode 120000 index 0000000000..bef734ed69 --- /dev/null +++ b/test/zdtm/static/sk-unix01-seqpacket.c @@ -0,0 +1 @@ +sk-unix01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix01-seqpacket.desc b/test/zdtm/static/sk-unix01-seqpacket.desc new file mode 120000 index 0000000000..7a30da25c8 --- /dev/null +++ b/test/zdtm/static/sk-unix01-seqpacket.desc @@ -0,0 +1 @@ +sk-unix01.desc \ No newline at end of file diff --git a/test/zdtm/static/sk-unix01.c b/test/zdtm/static/sk-unix01.c index c2bb8b9edb..5146c027f4 100644 --- a/test/zdtm/static/sk-unix01.c +++ b/test/zdtm/static/sk-unix01.c @@ -24,6 +24,12 @@ const char *test_author = "Cyrill Gorcunov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + static int sk_alloc_bind(int type, struct sockaddr_un *addr) { int sk; @@ -256,7 +262,7 @@ int main(int argc, char **argv) 
unlink(addr.sun_path); - sk_st[0] = sk_alloc_bind(SOCK_STREAM, &addr); + sk_st[0] = sk_alloc_bind(SOCK_TYPE, &addr); if (sk_st[0] < 0) return 1; test_msg("sk-st: alloc/bind/listen %d\n", sk_st[0]); @@ -266,7 +272,7 @@ int main(int argc, char **argv) return 1; } - sk_st[1] = sk_alloc_connect(SOCK_STREAM, &addr); + sk_st[1] = sk_alloc_connect(SOCK_TYPE, &addr); if (sk_st[1] < 0) return 1; test_msg("sk-st: alloc/connect %d\n", sk_st[1]); @@ -279,7 +285,7 @@ int main(int argc, char **argv) } test_msg("sk-st: accept %d\n", sk_st[2]); - sk_st[3] = sk_alloc_connect(SOCK_STREAM, &addr); + sk_st[3] = sk_alloc_connect(SOCK_TYPE, &addr); if (sk_st[3] < 0) return 1; test_msg("sk-st: alloc/connect %d\n", sk_st[3]); diff --git a/test/zdtm/static/sock_opts02.c b/test/zdtm/static/sock_opts02.c new file mode 100644 index 0000000000..7ea98744ac --- /dev/null +++ b/test/zdtm/static/sock_opts02.c @@ -0,0 +1,118 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that SO_BUF_LOCK option dumped"; +const char *test_author = "Pavel Tikhomirov "; + +#ifndef SO_BUF_LOCK +#define SO_BUF_LOCK 72 +#endif + +#ifndef SOCK_SNDBUF_LOCK +#define SOCK_SNDBUF_LOCK 1 +#endif +#ifndef SOCK_RCVBUF_LOCK +#define SOCK_RCVBUF_LOCK 2 +#endif + +#define BUFSIZE 16384 + +struct sk_opt { + int type; + uint32_t val; + uint32_t lock; +} sk_opts[] = { { SO_BUF_LOCK, 0, 0 }, + { SO_BUF_LOCK, SOCK_SNDBUF_LOCK, SOCK_SNDBUF_LOCK }, + { SO_BUF_LOCK, SOCK_RCVBUF_LOCK, SOCK_RCVBUF_LOCK }, + { SO_BUF_LOCK, SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK, SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK }, + { SO_SNDBUF, BUFSIZE, SOCK_SNDBUF_LOCK }, + { SO_RCVBUF, BUFSIZE, SOCK_RCVBUF_LOCK } }; + +#define NSOCK ARRAY_SIZE(sk_opts) + +char *type_to_str(int type) +{ + switch (type) { + case SO_BUF_LOCK: + return "SO_BUF_LOCK"; + case SO_SNDBUFFORCE: + return "SO_SNDBUFFORCE"; + case SO_RCVBUFFORCE: + return "SO_RCVBUFFORCE"; + } + return NULL; +} + +int main(int argc, char **argv) +{ + int 
sock[NSOCK]; + int ret, i; + int exit_code = 1; + + test_init(argc, argv); + + for (i = 0; i < NSOCK; i++) + sock[i] = -1; + + for (i = 0; i < NSOCK; i++) { + uint32_t tmp; + socklen_t len; + + sock[i] = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock[i] < 0) { + pr_perror("can't create socket %d", i); + goto err; + } + + ret = setsockopt(sock[i], SOL_SOCKET, sk_opts[i].type, &sk_opts[i].val, sizeof(sk_opts[i].val)); + if (ret < 0) { + pr_perror("can't set %s (%u) on socket %d", type_to_str(sk_opts[i].type), sk_opts[i].val, i); + goto err; + } + + len = sizeof(tmp); + ret = getsockopt(sock[i], SOL_SOCKET, SO_BUF_LOCK, &tmp, &len); + if (ret < 0) { + pr_perror("can't get SO_BUF_LOCK from socket %d", i); + goto err; + } + + if (tmp != sk_opts[i].lock) { + fail("SO_BUF_LOCK missmatch %u != %u", tmp, sk_opts[i].lock); + goto err; + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NSOCK; i++) { + uint32_t tmp; + socklen_t len; + + len = sizeof(tmp); + ret = getsockopt(sock[i], SOL_SOCKET, SO_BUF_LOCK, &tmp, &len); + if (ret < 0) { + pr_perror("can't get SO_BUF_LOCK from socket %d", i); + goto err; + } + + if (tmp != sk_opts[i].lock) { + fail("SO_BUF_LOCK missmatch %u != %u", tmp, sk_opts[i].lock); + goto err; + } + } + + pass(); + exit_code = 0; +err: + for (i = 0; i < NSOCK; i++) + close(sock[i]); + + return exit_code; +} diff --git a/test/zdtm/static/sock_opts02.desc b/test/zdtm/static/sock_opts02.desc new file mode 100644 index 0000000000..37d3a63545 --- /dev/null +++ b/test/zdtm/static/sock_opts02.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'sockopt_buf_lock'} diff --git a/test/zdtm/static/socket-tcp-close2.c b/test/zdtm/static/socket-tcp-close2.c new file mode 100644 index 0000000000..697c99f39d --- /dev/null +++ b/test/zdtm/static/socket-tcp-close2.c @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check both dump and restore with tcp_close on TCP_CLOSE 
sockets"; +const char *test_author = "Bui Quang Minh "; + +static int port = 8880; + +int main(int argc, char **argv) +{ + int fd_s, fd, client; + char c; + + test_init(argc, argv); + signal(SIGPIPE, SIG_IGN); + + fd_s = tcp_init_server(AF_INET, &port); + if (fd_s < 0) { + pr_err("Server initializations failed\n"); + return 1; + } + + client = tcp_init_client(AF_INET, "localhost", port); + if (client < 0) { + pr_err("Client initializations failed\n"); + return 1; + } + + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("Can't accept client\n"); + return 1; + } + close(fd_s); + + shutdown(client, SHUT_WR); + shutdown(fd, SHUT_WR); + + test_daemon(); + test_waitsig(); + + if (read(fd, &c, 1) != 0) { + fail("read server"); + return 1; + } + if (read(client, &c, 1) != 0) { + fail("read client"); + return 1; + } + if (write(client, &c, 1) != -1) { + fail("write client"); + return 1; + } + if (write(fd, &c, 1) != -1) { + fail("write server"); + return 1; + } + + pass(); + return 0; +} diff --git a/test/zdtm/static/socket-tcp-close2.desc b/test/zdtm/static/socket-tcp-close2.desc new file mode 100644 index 0000000000..c53a1f3153 --- /dev/null +++ b/test/zdtm/static/socket-tcp-close2.desc @@ -0,0 +1 @@ +{'opts': '--tcp-close', 'flags': 'reqrst '} diff --git a/test/zdtm/static/socket_queues.c b/test/zdtm/static/socket_queues.c index e30bca0e19..44495f06b9 100644 --- a/test/zdtm/static/socket_queues.c +++ b/test/zdtm/static/socket_queues.c @@ -24,6 +24,12 @@ const char *test_author = "Stanislav Kinsbursky \n"; #define SK_DATA_D1 "packet dgram left" #define SK_DATA_D2 "packet dgram right" +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + int main(int argc, char *argv[]) { int ssk_pair_d[2]; @@ -32,7 +38,7 @@ int main(int argc, char *argv[]) test_init(argc, argv); - if (socketpair(AF_UNIX, SOCK_STREAM, 0, ssk_pair_s) == -1) { + if (socketpair(AF_UNIX, SOCK_TYPE, 0, ssk_pair_s) == -1) { fail("socketpair"); 
exit(1); } diff --git a/test/zdtm/static/socket_queues_seqpacket.c b/test/zdtm/static/socket_queues_seqpacket.c new file mode 120000 index 0000000000..0f3f93ea65 --- /dev/null +++ b/test/zdtm/static/socket_queues_seqpacket.c @@ -0,0 +1 @@ +socket_queues.c \ No newline at end of file diff --git a/test/zdtm/static/sockets00-seqpacket.c b/test/zdtm/static/sockets00-seqpacket.c new file mode 120000 index 0000000000..4bce9fc31d --- /dev/null +++ b/test/zdtm/static/sockets00-seqpacket.c @@ -0,0 +1 @@ +sockets00.c \ No newline at end of file diff --git a/test/zdtm/static/sockets00-seqpacket.desc b/test/zdtm/static/sockets00-seqpacket.desc new file mode 120000 index 0000000000..4beea26423 --- /dev/null +++ b/test/zdtm/static/sockets00-seqpacket.desc @@ -0,0 +1 @@ +sockets00.desc \ No newline at end of file diff --git a/test/zdtm/static/sockets00.c b/test/zdtm/static/sockets00.c index 53890077b9..ac5d7d6fe6 100644 --- a/test/zdtm/static/sockets00.c +++ b/test/zdtm/static/sockets00.c @@ -25,6 +25,12 @@ TEST_OPTION(filename, string, "socket file name", 1); #define TEST_MODE 0640 +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + int main(int argc, char *argv[]) { int ssk_icon[4]; @@ -58,9 +64,9 @@ int main(int argc, char *argv[]) memcpy(addr.sun_path, path, addrlen); addrlen += sizeof(addr.sun_family); - ssk_icon[0] = socket(AF_UNIX, SOCK_STREAM, 0); - ssk_icon[1] = socket(AF_UNIX, SOCK_STREAM, 0); - ssk_icon[2] = socket(AF_UNIX, SOCK_STREAM, 0); + ssk_icon[0] = socket(AF_UNIX, SOCK_TYPE, 0); + ssk_icon[1] = socket(AF_UNIX, SOCK_TYPE, 0); + ssk_icon[2] = socket(AF_UNIX, SOCK_TYPE, 0); if (ssk_icon[0] < 0 || ssk_icon[1] < 0 || ssk_icon[2] < 0) { fail("socket"); exit(1); diff --git a/test/zdtm/static/sockets01-seqpacket.c b/test/zdtm/static/sockets01-seqpacket.c new file mode 120000 index 0000000000..8d51121e10 --- /dev/null +++ b/test/zdtm/static/sockets01-seqpacket.c @@ -0,0 +1 @@ +sockets01.c \ No newline at end of 
file diff --git a/test/zdtm/static/sockets01.c b/test/zdtm/static/sockets01.c index e35a31fece..f56cd219e3 100644 --- a/test/zdtm/static/sockets01.c +++ b/test/zdtm/static/sockets01.c @@ -30,6 +30,12 @@ const char *test_author = "Pavel Emelyanov "; #define TEST_MSG "test-message" static char buf[sizeof(TEST_MSG)]; +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + int main(int argc, char *argv[]) { int spu[2], spb[2], dpu[2], dpb[2], dpd[2]; @@ -40,14 +46,14 @@ int main(int argc, char *argv[]) signal(SIGPIPE, SIG_IGN); /* spu -- stream pair, unidirectional shutdown */ - if (socketpair(PF_UNIX, SOCK_STREAM, 0, spu) < 0) + if (socketpair(PF_UNIX, SOCK_TYPE, 0, spu) < 0) fin("no stream pair 1"); if (shutdown(spu[0], SHUT_RD) < 0) fin("no stream shutdown 1"); /* spb -- stream pair, bidirectional shutdown */ - if (socketpair(PF_UNIX, SOCK_STREAM, 0, spb) < 0) + if (socketpair(PF_UNIX, SOCK_TYPE, 0, spb) < 0) fin("no stream pair 2"); if (shutdown(spb[0], SHUT_RDWR) < 0) diff --git a/test/zdtm/static/sockets02-seqpacket.c b/test/zdtm/static/sockets02-seqpacket.c new file mode 120000 index 0000000000..b958315999 --- /dev/null +++ b/test/zdtm/static/sockets02-seqpacket.c @@ -0,0 +1 @@ +sockets02.c \ No newline at end of file diff --git a/test/zdtm/static/sockets02.c b/test/zdtm/static/sockets02.c index 2729ade2c3..d7d84d8152 100644 --- a/test/zdtm/static/sockets02.c +++ b/test/zdtm/static/sockets02.c @@ -16,6 +16,12 @@ const char *test_doc = "Test semi-closed unix stream connection\n"; const char *test_author = "Pavel Emelyanov \n"; +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + int main(int argc, char *argv[]) { int ssk_pair[2], ret; @@ -25,7 +31,7 @@ int main(int argc, char *argv[]) data = (char)lrand48(); - if (socketpair(AF_UNIX, SOCK_STREAM, 0, ssk_pair) == -1) { + if (socketpair(AF_UNIX, SOCK_TYPE, 0, ssk_pair) == -1) { fail("socketpair"); exit(1); 
} diff --git a/test/zdtm/static/sockets03-seqpacket.c b/test/zdtm/static/sockets03-seqpacket.c new file mode 120000 index 0000000000..997cce6735 --- /dev/null +++ b/test/zdtm/static/sockets03-seqpacket.c @@ -0,0 +1 @@ +sockets03.c \ No newline at end of file diff --git a/test/zdtm/static/sockets03-seqpacket.desc b/test/zdtm/static/sockets03-seqpacket.desc new file mode 120000 index 0000000000..3798a8242f --- /dev/null +++ b/test/zdtm/static/sockets03-seqpacket.desc @@ -0,0 +1 @@ +sockets03.desc \ No newline at end of file diff --git a/test/zdtm/static/sockets03.c b/test/zdtm/static/sockets03.c index cd6f608311..6b0915aaa6 100644 --- a/test/zdtm/static/sockets03.c +++ b/test/zdtm/static/sockets03.c @@ -22,6 +22,12 @@ const char *test_author = "Andrey Ryabinin "; char *filename; TEST_OPTION(filename, string, "socket file name", 1); +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + int main(int argc, char *argv[]) { int sk[3]; @@ -52,8 +58,8 @@ int main(int argc, char *argv[]) memcpy(addr.sun_path, path, addrlen); addrlen += sizeof(addr.sun_family); - sk[0] = socket(AF_UNIX, SOCK_STREAM, 0); - sk[1] = socket(AF_UNIX, SOCK_STREAM, 0); + sk[0] = socket(AF_UNIX, SOCK_TYPE, 0); + sk[1] = socket(AF_UNIX, SOCK_TYPE, 0); if (sk[0] < 0 || sk[1] < 0) { fail("socket"); exit(1); diff --git a/test/zdtm/static/sockets_spair.c b/test/zdtm/static/sockets_spair.c index 2dbb132aac..202c2e7901 100644 --- a/test/zdtm/static/sockets_spair.c +++ b/test/zdtm/static/sockets_spair.c @@ -18,6 +18,12 @@ const char *test_author = "Cyrill Gorcunov