diff --git a/.github/actions/veristat_baseline_compare/action.yml b/.github/actions/veristat_baseline_compare/action.yml new file mode 100644 index 000000000000..9e25a2b0bf1c --- /dev/null +++ b/.github/actions/veristat_baseline_compare/action.yml @@ -0,0 +1,49 @@ +name: 'run-veristat' +description: 'Run veristat benchmark' +inputs: + veristat_output: + description: 'Veristat output filepath' + required: true + baseline_name: + description: 'Veristat baseline cache name' + required: true +runs: + using: "composite" + steps: + - uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.baseline_name }} + if-no-files-found: error + path: ${{ github.workspace }}/${{ inputs.veristat_output }} + + # For pull request: + # - get baseline log from cache + # - compare it to current run + - if: ${{ github.event_name == 'pull_request' }} + uses: actions/cache/restore@v4 + with: + key: ${{ inputs.baseline_name }} + restore-keys: | + ${{ inputs.baseline_name }}- + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' + + - if: ${{ github.event_name == 'pull_request' }} + name: Show veristat comparison + shell: bash + run: ./.github/scripts/compare-veristat-results.sh + env: + BASELINE_PATH: ${{ github.workspace }}/${{ inputs.baseline_name }} + VERISTAT_OUTPUT: ${{ inputs.veristat_output }} + + # For push: just put baseline log to cache + - if: ${{ github.event_name == 'push' }} + shell: bash + run: | + mv "${{ github.workspace }}/${{ inputs.veristat_output }}" \ + "${{ github.workspace }}/${{ inputs.baseline_name }}" + + - if: ${{ github.event_name == 'push' }} + uses: actions/cache/save@v4 + with: + key: ${{ inputs.baseline_name }}-${{ github.run_id }} + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' diff --git a/.github/scripts/compare-veristat-results.sh b/.github/scripts/compare-veristat-results.sh new file mode 100755 index 000000000000..f95c3c192d80 --- /dev/null +++ b/.github/scripts/compare-veristat-results.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +if [[ 
! -f "${BASELINE_PATH}" ]]; then + echo "# No ${BASELINE_PATH} available" >> "${GITHUB_STEP_SUMMARY}" + + echo "No ${BASELINE_PATH} available" + echo "Printing veristat results" + cat "${VERISTAT_OUTPUT}" + + exit +fi + +selftests/bpf/veristat \ + --output-format csv \ + --emit file,prog,verdict,states \ + --compare "${BASELINE_PATH}" "${VERISTAT_OUTPUT}" > compare.csv + +python3 ./.github/scripts/veristat_compare.py compare.csv diff --git a/.github/scripts/get-commit-metadata.sh b/.github/scripts/get-commit-metadata.sh new file mode 100644 index 000000000000..f178786e4e58 --- /dev/null +++ b/.github/scripts/get-commit-metadata.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +branch="${GITHUB_BASE_REF}" + +if [ "${GITHUB_EVENT_NAME}" = 'push' ]; then + branch="${GITHUB_REF_NAME}" +fi + +echo "branch=${branch}" >> "${GITHUB_OUTPUT}" + +upstream="${branch//_base/}" +commit="$( + git rev-parse "origin/${upstream}" &> /dev/null \ + || ( + git fetch --quiet --prune --no-tags --depth=1 --no-recurse-submodules origin "+refs/heads/${upstream}:refs/remotes/origin/${upstream}" && \ + git rev-parse "origin/${upstream}" + ) +)" +timestamp_utc="$(TZ=utc git show --format='%cd' --no-patch --date=iso-strict-local "${commit}")" + +echo "timestamp=${timestamp_utc}" >> "${GITHUB_OUTPUT}" +echo "commit=${commit}" >> "${GITHUB_OUTPUT}" +echo "Most recent upstream commit is ${commit}" diff --git a/.github/scripts/matrix.py b/.github/scripts/matrix.py new file mode 100644 index 000000000000..c339021534df --- /dev/null +++ b/.github/scripts/matrix.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 + +import os +import dataclasses +import json + +from enum import Enum +from typing import Any, Dict, List, Final, Set, Union + +MANAGED_OWNER: Final[str] = "kernel-patches" +MANAGED_REPOS: Final[Set[str]] = { + f"{MANAGED_OWNER}/bpf", + f"{MANAGED_OWNER}/vmtest", +} +# We need to run on ubuntu 20.04 because our rootfs is based on debian buster and we +# otherwise get library versioning issue such as +# 
`./test_verifier: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by ./test_verifier)` +DEFAULT_RUNNER: Final[str] = "ubuntu-20.04" +DEFAULT_LLVM_VERSION: Final[int] = 17 +DEFAULT_SELF_HOSTED_RUNNER_TAGS: Final[List[str]] = ["self-hosted", "docker-noble-main"] + + +class Arch(str, Enum): + """ + CPU architecture supported by CI. + """ + + AARCH64 = "aarch64" + S390X = "s390x" + X86_64 = "x86_64" + + +class Compiler(str, Enum): + GCC = "gcc" + LLVM = "llvm" + + +@dataclasses.dataclass +class Toolchain: + compiler: Compiler + # This is relevant ONLY for LLVM and should not be required for GCC + version: int + + @property + def short_name(self) -> str: + return str(self.compiler.value) + + @property + def full_name(self) -> str: + if self.compiler == Compiler.GCC: + return self.short_name + + return f"{self.short_name}-{self.version}" + + def to_dict(self) -> Dict[str, Union[str, int]]: + return { + "name": self.short_name, + "fullname": self.full_name, + "version": self.version, + } + + +@dataclasses.dataclass +class BuildConfig: + arch: Arch + toolchain: Toolchain + kernel: str = "LATEST" + run_veristat: bool = False + parallel_tests: bool = False + build_release: bool = False + + @property + def runs_on(self) -> List[str]: + if is_managed_repo(): + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [self.arch.value] + return [DEFAULT_RUNNER] + + @property + def build_runs_on(self) -> List[str]: + if is_managed_repo(): + # Build s390x on x86_64 + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [ + self.arch.value == "s390x" and Arch.X86_64.value or self.arch.value, + ] + return [DEFAULT_RUNNER] + + @property + def tests(self) -> Dict[str, Any]: + tests_list = [ + "test_progs", + "test_progs_parallel", + "test_progs_no_alu32", + "test_progs_no_alu32_parallel", + "test_verifier", + ] + + if self.arch.value != "s390x": + tests_list.append("test_maps") + + if self.toolchain.version >= 18: + tests_list.append("test_progs_cpuv4") + + if not self.parallel_tests: + 
tests_list = [test for test in tests_list if not test.endswith("parallel")] + + return {"include": [generate_test_config(test) for test in tests_list]} + + def to_dict(self) -> Dict[str, Any]: + return { + "arch": self.arch.value, + "toolchain": self.toolchain.to_dict(), + "kernel": self.kernel, + "run_veristat": self.run_veristat, + "parallel_tests": self.parallel_tests, + "build_release": self.build_release, + "runs_on": self.runs_on, + "tests": self.tests, + "build_runs_on": self.build_runs_on, + } + + +def is_managed_repo() -> bool: + return ( + os.environ["GITHUB_REPOSITORY_OWNER"] == MANAGED_OWNER + and os.environ["GITHUB_REPOSITORY"] in MANAGED_REPOS + ) + + +def set_output(name, value): + """Write an output variable to the GitHub output file.""" + with open(os.getenv("GITHUB_OUTPUT"), "a", encoding="utf-8") as file: + file.write(f"{name}={value}\n") + + +def generate_test_config(test: str) -> Dict[str, Union[str, int]]: + """Create the configuration for the provided test.""" + is_parallel = test.endswith("_parallel") + config = { + "test": test, + "continue_on_error": is_parallel, + # While in experimental mode, parallel jobs may get stuck + # anywhere, including in user space where the kernel won't detect + # a problem and panic. We add a second layer of (smaller) timeouts + # here such that if we get stuck in a parallel run, we hit this + # timeout and fail without affecting the overall job success (as + # would be the case if we hit the job-wide timeout). For + # non-experimental jobs, 360 is the default which will be + # superseded by the overall workflow timeout (but we need to + # specify something). 
+ "timeout_minutes": 30 if is_parallel else 360, + } + return config + + +if __name__ == "__main__": + matrix = [ + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + run_veristat=True, + parallel_tests=True, + ), + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.LLVM, version=DEFAULT_LLVM_VERSION), + build_release=True, + ), + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.LLVM, version=18), + build_release=True, + ), + BuildConfig( + arch=Arch.AARCH64, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + ), + # BuildConfig( + # arch=Arch.AARCH64, + # toolchain=Toolchain( + # compiler=Compiler.LLVM, + # version=DEFAULT_LLVM_VERSION + # ), + # ), + BuildConfig( + arch=Arch.S390X, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + ), + ] + + # Outside of those repositories we only run on x86_64 + if not is_managed_repo(): + matrix = [config for config in matrix if config.arch == Arch.X86_64] + + json_matrix = json.dumps({"include": [config.to_dict() for config in matrix]}) + print(json_matrix) + set_output("build_matrix", json_matrix) diff --git a/.github/scripts/prepare-incremental-builds.sh b/.github/scripts/prepare-incremental-builds.sh new file mode 100644 index 000000000000..17f825480ff4 --- /dev/null +++ b/.github/scripts/prepare-incremental-builds.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -eu + +commit_id="${1}" + +# $1 - the SHA-1 to fetch and check out +fetch_and_checkout() { + local build_base_sha + + build_base_sha="${1}" + # If cached artifacts became stale for one reason or another, we + # may not have the build base SHA available. Fetch it and retry. + git fetch origin "${build_base_sha}" && git checkout --quiet "${build_base_sha}" +} + +# $1 - value of KBUILD_OUTPUT +clear_cache_artifacts() { + local output_dir + + output_dir="${1}" + echo "Unable to find earlier upstream ref. 
Discarding KBUILD_OUTPUT contents..." + rm --recursive --force "${output_dir}" + mkdir "${output_dir}" + false +} + +# $1 - value of KBUILD_OUTPUT +# $2 - current time in ISO 8601 format +restore_source_code_times() { + local build_output + local current_time + local src_time + local obj_time + + build_output="${1}" + current_time="${2}" + src_time="$(date --iso-8601=ns --date="${current_time} - 2 minutes")" + obj_time="$(date --iso-8601=ns --date="${current_time} - 1 minute")" + + git ls-files | xargs --max-args=10000 touch -m --no-create --date="${src_time}" + find "${build_output}" -type f | xargs --max-args=10000 touch -m --no-create --date="${obj_time}" + git checkout --quiet - + echo "Adjusted src and obj time stamps relative to system time" +} + +mkdir --parents "${KBUILD_OUTPUT}" +current_time="$(date --iso-8601=ns)" + +if [ -f "${KBUILD_OUTPUT}/.build-base-sha" ]; then + build_base_sha="$(cat "${KBUILD_OUTPUT}/.build-base-sha")" + echo "Setting up base build state for ${build_base_sha}" + + ( + git checkout --quiet "${build_base_sha}" \ + || fetch_and_checkout "${build_base_sha}" \ + || clear_cache_artifacts "${KBUILD_OUTPUT}" + ) && restore_source_code_times "${KBUILD_OUTPUT}" "${current_time}" +else + echo "No previous build data found" +fi + +echo -n "${commit_id}" > "${KBUILD_OUTPUT}/.build-base-sha" diff --git a/.github/scripts/tar-artifact.sh b/.github/scripts/tar-artifact.sh new file mode 100644 index 000000000000..8886cd6abde4 --- /dev/null +++ b/.github/scripts/tar-artifact.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +set -eux + +arch="${1}" +toolchain="${2}" + +# Convert a platform (as returned by uname -m) to the kernel +# arch (as expected by ARCH= env). +platform_to_kernel_arch() { + case $1 in + s390x) + echo "s390" + ;; + aarch64) + echo "arm64" + ;; + riscv64) + echo "riscv" + ;; + x86_64) + echo "x86" + ;; + *) + echo "$1" + ;; + esac +} + +# Remove intermediate object files that we have no use for. 
Ideally +# we'd just exclude them from tar below, but it does not provide +# options to express the precise constraints. +find selftests/ -name "*.o" -a ! -name "*.bpf.o" -print0 | \ + xargs --null --max-args=10000 rm + +# Strip debug information, which is excessively large (consuming +# bandwidth) while not actually being used (the kernel does not use +# DWARF to symbolize stacktraces). +"${arch}"-linux-gnu-strip --strip-debug "${KBUILD_OUTPUT}"/vmlinux + +additional_file_list=() +if [ "${GITHUB_REPOSITORY}" == "kernel-patches/vmtest" ]; then + # Package up a bunch of additional infrastructure to support running + # 'make kernelrelease' and bpf tool checks later on. + mapfile -t additional_file_list < <(find . -iname Makefile) + additional_file_list+=( + "scripts/" + "tools/testing/selftests/bpf/" + "tools/include/" + "tools/bpf/bpftool/" + ) +fi + +image_name=$(make ARCH="$(platform_to_kernel_arch "${arch}")" -s image_name) + +# zstd is installed by default in the runner images. +tar -cf - \ + "${KBUILD_OUTPUT}/.config" \ + "${KBUILD_OUTPUT}/${image_name}" \ + "${KBUILD_OUTPUT}/include/config/auto.conf" \ + "${KBUILD_OUTPUT}/include/generated/autoconf.h" \ + "${KBUILD_OUTPUT}/vmlinux" \ + "${additional_file_list[@]}" \ + --exclude '*.cmd' \ + --exclude '*.d' \ + --exclude '*.h' \ + --exclude '*.output' \ + selftests/bpf/ | zstd -T0 -19 -o "vmlinux-${arch}-${toolchain}.tar.zst" diff --git a/.github/scripts/tests/test_veristat_compare.py b/.github/scripts/tests/test_veristat_compare.py new file mode 100644 index 000000000000..b65b69295235 --- /dev/null +++ b/.github/scripts/tests/test_veristat_compare.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import unittest +from typing import Iterable, List + +from ..veristat_compare import parse_table, VeristatFields + + +def gen_csv_table(records: Iterable[str]) -> List[str]: + return [ + ",".join(VeristatFields.headers()), + *records, + ] + + +class TestVeristatCompare(unittest.TestCase): + def 
test_parse_table_ignore_new_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,N/A,success,N/A,N/A,1,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_ignore_removed_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,N/A,N/A,1,N/A,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_new_failure(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,failure,MISMATCH,1,1,+0 (+0.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [["prog_file.bpf.o", "prog_name", "success -> failure (!!)", "+0.00 %"]], + ) + self.assertTrue(veristat_info.changes) + self.assertTrue(veristat_info.new_failures) + + def test_parse_table_new_changes(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,failure,success,MISMATCH,0,0,+0 (+0.00%)", + "prog_file.bpf.o,prog_name_increase,failure,failure,MATCH,1,2,+1 (+100.00%)", + "prog_file.bpf.o,prog_name_decrease,success,success,MATCH,1,1,-1 (-100.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [ + ["prog_file.bpf.o", "prog_name", "failure -> success", "+0.00 %"], + ["prog_file.bpf.o", "prog_name_increase", "failure", "+100.00 %"], + ["prog_file.bpf.o", "prog_name_decrease", "success", "-100.00 %"], + ], + ) + self.assertTrue(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/scripts/veristat_compare.py b/.github/scripts/veristat_compare.py new file mode 100644 index 000000000000..07271b8cbd3a --- /dev/null +++ b/.github/scripts/veristat_compare.py @@ -0,0 +1,263 @@ 
+#!/usr/bin/env python3 + +# This script reads a CSV file produced by the following invocation: +# +# veristat --emit file,prog,verdict,states \ +# --output-format csv \ +# --compare ... +# +# And produces a markdown summary for the file. +# The summary is printed to standard output and appended to a file +# pointed to by GITHUB_STEP_SUMMARY variable. +# +# Script exits with return code 1 if there are new failures in the +# veristat results. +# +# For testing purposes invoke as follows: +# +# GITHUB_STEP_SUMMARY=/dev/null python3 veristat_compare.py test.csv +# +# File format (columns): +# 0. file_name +# 1. prog_name +# 2. verdict_base +# 3. verdict_comp +# 4. verdict_diff +# 5. total_states_base +# 6. total_states_comp +# 7. total_states_diff +# +# Records sample: +# file-a,a,success,failure,MISMATCH,12,12,+0 (+0.00%) +# file-b,b,success,success,MATCH,67,67,+0 (+0.00%) +# +# For better readability suffixes '_OLD' and '_NEW' +# are used instead of '_base' and '_comp' for variable +# names etc. + +import io +import os +import sys +import re +import csv +import logging +import argparse +import enum +from dataclasses import dataclass +from typing import Dict, Iterable, List, Final + + +TRESHOLD_PCT: Final[int] = 0 + +SUMMARY_HEADERS = ["File", "Program", "Verdict", "States Diff (%)"] + +# expected format: +0 (+0.00%) / -0 (-0.00%) +TOTAL_STATES_DIFF_REGEX = ( + r"(?P<absolute_diff>[+-]\d+) \((?P<percentage_diff>[+-]\d+\.\d+)\%\)" +) + + +TEXT_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +{table} +""".strip() +) + +HTML_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +<details>
+<summary>Click to expand</summary> + +{table} +</details>
+""".strip() +) + +GITHUB_MARKUP_REPLACEMENTS: Final[Dict[str, str]] = { + "->": "→", + "(!!)": ":bangbang:", +} + +NEW_FAILURE_SUFFIX: Final[str] = "(!!)" + + +class VeristatFields(str, enum.Enum): + FILE_NAME = "file_name" + PROG_NAME = "prog_name" + VERDICT_OLD = "verdict_base" + VERDICT_NEW = "verdict_comp" + VERDICT_DIFF = "verdict_diff" + TOTAL_STATES_OLD = "total_states_base" + TOTAL_STATES_NEW = "total_states_comp" + TOTAL_STATES_DIFF = "total_states_diff" + + @classmethod + def headers(cls) -> List[str]: + return [ + cls.FILE_NAME, + cls.PROG_NAME, + cls.VERDICT_OLD, + cls.VERDICT_NEW, + cls.VERDICT_DIFF, + cls.TOTAL_STATES_OLD, + cls.TOTAL_STATES_NEW, + cls.TOTAL_STATES_DIFF, + ] + + +@dataclass +class VeristatInfo: + table: list + changes: bool + new_failures: bool + + def get_results_title(self) -> str: + if self.new_failures: + return "There are new veristat failures" + + if self.changes: + return "There are changes in verification performance" + + return "No changes in verification performance" + + def get_results_summary(self, markup: bool = False) -> str: + title = self.get_results_title() + if not self.table: + return f"# {title}\n" + + template = TEXT_SUMMARY_TEMPLATE + table = format_table(headers=SUMMARY_HEADERS, rows=self.table) + + if markup: + template = HTML_SUMMARY_TEMPLATE + table = github_markup_decorate(table) + + return template.format(title=title, table=table) + + +def get_state_diff(value: str) -> float: + if value == "N/A": + return 0.0 + + matches = re.match(TOTAL_STATES_DIFF_REGEX, value) + if not matches: + raise ValueError(f"Failed to parse total states diff field value '{value}'") + + if percentage_diff := matches.group("percentage_diff"): + return float(percentage_diff) + + raise ValueError(f"Invalid {VeristatFields.TOTAL_STATES_DIFF} field value: {value}") + + +def parse_table(csv_file: Iterable[str]) -> VeristatInfo: + reader = csv.DictReader(csv_file) + assert reader.fieldnames == VeristatFields.headers() + + new_failures = 
False + changes = False + table = [] + + for record in reader: + add = False + + verdict_old, verdict_new = ( + record[VeristatFields.VERDICT_OLD], + record[VeristatFields.VERDICT_NEW], + ) + + # Ignore results from completely new and removed programs + if "N/A" in [verdict_new, verdict_old]: + continue + + if record[VeristatFields.VERDICT_DIFF] == "MISMATCH": + changes = True + add = True + verdict = f"{verdict_old} -> {verdict_new}" + if verdict_new == "failure": + new_failures = True + verdict += f" {NEW_FAILURE_SUFFIX}" + else: + verdict = record[VeristatFields.VERDICT_NEW] + + diff = get_state_diff(record[VeristatFields.TOTAL_STATES_DIFF]) + if abs(diff) > TRESHOLD_PCT: + changes = True + add = True + + if not add: + continue + + table.append( + [ + record[VeristatFields.FILE_NAME], + record[VeristatFields.PROG_NAME], + verdict, + f"{diff:+.2f} %", + ] + ) + + return VeristatInfo(table=table, changes=changes, new_failures=new_failures) + + +def github_markup_decorate(input_str: str) -> str: + for text, markup in GITHUB_MARKUP_REPLACEMENTS.items(): + input_str = input_str.replace(text, markup) + return input_str + + +def format_table(headers: List[str], rows: List[List[str]]) -> str: + column_width = [ + max(len(row[column_idx]) for row in [headers] + rows) + for column_idx in range(len(headers)) + ] + + # Row template string in the following format: + # "{0:8}|{1:10}|{2:15}|{3:7}|{4:10}" + row_template = "|".join( + f"{{{idx}:{width}}}" for idx, width in enumerate(column_width) + ) + row_template_nl = f"|{row_template}|\n" + + with io.StringIO() as out: + out.write(row_template_nl.format(*headers)) + + separator_row = ["-" * width for width in column_width] + out.write(row_template_nl.format(*separator_row)) + + for row in rows: + row_str = row_template_nl.format(*row) + out.write(row_str) + + return out.getvalue() + + +def main(compare_csv_filename: os.PathLike, output_filename: os.PathLike) -> None: + with open(compare_csv_filename, newline="", 
encoding="utf-8") as csv_file: + veristat_results = parse_table(csv_file) + + sys.stdout.write(veristat_results.get_results_summary()) + + with open(output_filename, encoding="utf-8", mode="a") as file: + file.write(veristat_results.get_results_summary(markup=True)) + + if veristat_results.new_failures: + return 1 + + return 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Print veristat comparison output as markdown step summary" + ) + parser.add_argument("filename") + args = parser.parse_args() + summary_filename = os.getenv("GITHUB_STEP_SUMMARY") + if not summary_filename: + logging.error("GITHUB_STEP_SUMMARY environment variable is not set") + sys.exit(1) + sys.exit(main(args.filename, summary_filename)) diff --git a/.github/workflows/kernel-build-test.yml b/.github/workflows/kernel-build-test.yml new file mode 100644 index 000000000000..6296b36b9742 --- /dev/null +++ b/.github/workflows/kernel-build-test.yml @@ -0,0 +1,116 @@ +name: Reusable Build/Test/Veristat workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + build_runs_on: + required: true + type: string + description: The runners to run the builds on. This is a json string representing an array of labels. + llvm-version: + required: true + type: string + description: The version of LLVM used to build selftest.... for llvm toolchain, this should match the one from toolchain_full, for gcc it is an arbitrary version we decide to build selftests against.
+ kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + tests: + required: true + type: string + description: A serialized json array with the tests to be running, it must follow the json-matrix format, https://www.jitsejan.com/use-github-actions-with-json-file-as-matrix + run_veristat: + required: true + type: boolean + description: Whether or not to run the veristat job. + run_tests: + required: true + type: boolean + description: Whether or not to run the test job. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + build_release: + required: true + type: boolean + description: Build selftests with -O2 optimization in addition to non-optimized build. + default: false + secrets: + AWS_ROLE_ARN: + required: true + +jobs: + # Build kernel and selftest + build: + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + llvm-version: ${{ inputs.llvm-version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + build-release: + if: ${{ inputs.build_release }} + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.runs_on }} + llvm-version: ${{ inputs.llvm-version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + release: true + test: + if: ${{ inputs.run_tests }} + uses: ./.github/workflows/kernel-test.yml + # Setting name to test here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: "test" + 
needs: [build] + strategy: + fail-fast: false + matrix: ${{ fromJSON(inputs.tests) }} + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + runs_on: ${{ inputs.runs_on }} + kernel: ${{ inputs.kernel }} + test: ${{ matrix.test }} + continue_on_error: ${{ toJSON(matrix.continue_on_error) }} + timeout_minutes: ${{ matrix.timeout_minutes }} + + veristat: + if: ${{ inputs.run_veristat }} + uses: ./.github/workflows/kernel-veristat.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + aws_region: ${{ vars.AWS_REGION }} + runs_on: ${{ inputs.runs_on }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} diff --git a/.github/workflows/kernel-build.yml b/.github/workflows/kernel-build.yml new file mode 100644 index 000000000000..cf378dd98032 --- /dev/null +++ b/.github/workflows/kernel-build.yml @@ -0,0 +1,142 @@ + +name: Reusable build workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + llvm-version: + required: true + type: string + description: The version of LLVM used to build selftest.... for llvm toolchain, this should match the one from toolchain_full, for gcc it is an arbitrary version we decide to build selftests against. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel.
+ download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + release: + required: false + type: boolean + description: Build selftest with -O2 optimization + default: false + +jobs: + build: + name: build for ${{ inputs.arch }} with ${{ inputs.toolchain_full }}${{ inputs.release && '-O2' || '' }} + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + env: + KERNEL: ${{ inputs.kernel }} + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + steps: + - uses: actions/checkout@v4 + # We fetch an actual bit of history here to facilitate incremental + # builds (which may check out some earlier upstream change). + with: + fetch-depth: 50 + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + uses: libbpf/ci/get-linux-source@main + with: + dest: '.kernel' + - if: ${{ inputs.download_sources }} + name: Move linux source in place + shell: bash + run: | + rm -rf .kernel/.git + cp -rf .kernel/. . 
+ rm -rf .kernel + - name: Get commit meta-data + id: get-commit-metadata + run: | + bash .github/scripts/get-commit-metadata.sh + - name: Pull recent KBUILD_OUTPUT contents + uses: actions/cache@v4 + with: + path: ${{ env.KBUILD_OUTPUT }} + key: kbuild-output-${{ inputs.arch }}-${{ inputs.toolchain_full }}-${{ steps.get-commit-metadata.outputs.branch }}-${{ steps.get-commit-metadata.outputs.timestamp }}-${{ steps.get-commit-metadata.outputs.commit }} + restore-keys: | + kbuild-output-${{ inputs.arch }}-${{ inputs.toolchain_full }}-${{ steps.get-commit-metadata.outputs.branch }}-${{ steps.get-commit-metadata.outputs.timestamp }}- + kbuild-output-${{ inputs.arch }}-${{ inputs.toolchain_full }}-${{ steps.get-commit-metadata.outputs.branch }}- + kbuild-output-${{ inputs.arch }}-${{ inputs.toolchain_full }}- + - name: Prepare incremental build + shell: bash + run: | + bash .github/scripts/prepare-incremental-builds.sh ${{ steps.get-commit-metadata.outputs.commit }} + - uses: libbpf/ci/patch-kernel@main + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: '${{ github.workspace }}' + - name: Setup build environment + uses: libbpf/ci/setup-build-env@main + with: + arch: ${{ inputs.arch }} + llvm-version: ${{ inputs.llvm-version }} + pahole: c2f89dab3f2b0ebb53bab3ed8be32f41cb743c37 + - name: Build kernel image + uses: libbpf/ci/build-linux@main + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm-version }} + - name: Build selftests + uses: libbpf/ci/build-selftests@main + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm-version }} + env: + # RELEASE= disables all optimizations + # RELEASE=0 adds -O0 make flag + # RELEASE=1 adds -O2 make flag + RELEASE: ${{ inputs.release && '1' || '' }} + - if: ${{ github.event_name != 'push' }} +
name: Build samples + uses: libbpf/ci/build-samples@main + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm-version }} + - name: Tar artifacts + run: | + bash .github/scripts/tar-artifact.sh ${{ inputs.arch }} ${{ inputs.toolchain_full }} + - if: ${{ github.event_name != 'push' }} + name: Remove KBUILD_OUTPUT content + shell: bash + run: | + # Remove $KBUILD_OUTPUT to prevent cache creation for pull requests. + # Only on pushed changes are build artifacts actually cached, because + # of github.com/actions/cache's cache isolation logic. + rm -rf "${KBUILD_OUTPUT}" + - uses: actions/upload-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}${{ inputs.release && '-release' || '' }} + if-no-files-found: error + path: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst diff --git a/.github/workflows/kernel-test.yml b/.github/workflows/kernel-test.yml new file mode 100644 index 000000000000..38d2d664e126 --- /dev/null +++ b/.github/workflows/kernel-test.yml @@ -0,0 +1,69 @@ +name: Reusable test workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + test: + required: true + type: string + description: The test to run in the vm, e.g test_progs, test_maps, test_progs_no_alu32... + continue_on_error: + required: true + type: string + description: Whether to continue on error. 
This is typically set to true for parallel tests which are currently known to fail, but we don't want to fail the whole CI because of that. + timeout_minutes: + required: true + type: number + description: In case a test runs for too long, after how many minutes shall we time out and error. + +jobs: + test: + name: ${{ inputs.test }} on ${{ inputs.arch }} with ${{ inputs.toolchain_full }} + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + env: + KERNEL: ${{ inputs.kernel }} + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + CONTINUE_ON_ERROR: ${{ inputs.continue_on_error }} + steps: + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: . + - name: Untar artifacts + # zstd is installed by default in the runner images. + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + - name: Run selftests + uses: libbpf/ci/run-vmtest@main + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + continue-on-error: ${{ fromJSON(env.CONTINUE_ON_ERROR) }} + timeout-minutes: ${{ inputs.timeout_minutes }} + with: + arch: ${{ inputs.arch}} + img: '/tmp/root.img' + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: ${{ inputs.test }} \ No newline at end of file diff --git a/.github/workflows/kernel-veristat.yml b/.github/workflows/kernel-veristat.yml new file mode 100644 index 000000000000..03a3c4dc8f39 --- /dev/null +++ b/.github/workflows/kernel-veristat.yml @@ -0,0 +1,96 @@ +name: Reusable veristat workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... 
+ toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + aws_region: + required: true + type: string + description: The AWS region where we pull bpf objects to run against veristat. + secrets: + AWS_ROLE_ARN: + required: true + description: The AWS role used by GH to pull BPF objects from AWS. + +jobs: + veristat: + name: veristat on ${{ inputs.arch }} with ${{ inputs.toolchain }} + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain }} + steps: + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + - name: Untar artifacts + # zstd is installed by default in the runner images. + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Configure AWS Credentials + # Disabling BPF objects download and veristat-meta benchmark for PRs + # created from fork repositories. 
These won't have access to required + # environment variables and secrets, and otherwise would consistently fail + if: ${{ github.event.pull_request.head.repo.full_name == github.repository }} + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-region: ${{ inputs.aws_region }} + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} + role-session-name: github-action-bpf-ci + + - name: Download BPF objects + if: ${{ github.event.pull_request.head.repo.full_name == github.repository }} + run: | + set -eux + if [ -n "$AWS_ROLE_ARN" ]; then + mkdir ./bpf_objects + aws s3 sync s3://veristat-bpf-binaries ./bpf_objects + fi + env: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + - name: Run veristat + uses: libbpf/ci/run-vmtest@main + with: + arch: x86_64 + img: '/tmp/root.img' + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + # Don't run meta's veristat from forked repo. + kernel-test: ${{ github.event.pull_request.head.repo.full_name == github.repository && 'run_veristat_kernel,run_veristat_meta' || 'run_veristat_kernel' }} + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.kernel.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-kernel + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-kernel + + - name: Compare and save veristat.meta.csv + if: ${{ github.event.pull_request.head.repo.full_name == github.repository }} + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-meta + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-meta diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000000..1c910fd29730 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,65 @@ +name: "lint" + +on: + pull_request: + push: + branches: + - master + +jobs: + shellcheck: + # This workflow gets injected into other Linux repositories, but we don't + # want it to run there. 
+ if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: ShellCheck + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run ShellCheck + uses: ludeeus/action-shellcheck@master + env: + SHELLCHECK_OPTS: --severity=warning --exclude=SC1091 + + # Ensure some consistency in the formatting. + lint: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Lint + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run black + uses: psf/black@stable + with: + src: ./.github/scripts + + validate_matrix: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Validate matrix.py + runs-on: ubuntu-latest + env: + GITHUB_REPOSITORY_OWNER: ${{ matrix.owner }} + GITHUB_REPOSITORY: ${{ matrix.repository }} + GITHUB_OUTPUT: /dev/stdout + strategy: + matrix: + owner: ['kernel-patches', 'foo'] + repository: ['bpf', 'vmtest', 'bar'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: run script + run: | + python3 .github/scripts/matrix.py + + unittests: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Unittests + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run unittests + run: python3 -m unittest scripts/tests/*.py + working-directory: .github diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000000..3c81e02bff40 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,58 @@ +name: bpf-ci + +on: + pull_request: + push: + branches: + - bpf_base + - bpf-next_base + +concurrency: + group: ci-test-${{ github.ref_name }} + cancel-in-progress: true + +jobs: + set-matrix: + # FIXME: set-matrix is lightweight, run it on any self-hosted machines for kernel-patches org + # so we do not wait for GH hosted runners when there potentially all are busy because of bpf-rc + # repo for instance. 
+ # This could be somehow fixed long term by making this action/workflow re-usable and letting the caller + # specify what to run on. + runs-on: ${{ github.repository_owner == 'kernel-patches' && 'x86_64' || 'ubuntu-latest' }} + outputs: + build-matrix: ${{ steps.set-matrix-impl.outputs.build_matrix }} + steps: + - uses: actions/checkout@v4 + - id: set-matrix-impl + run: | + python3 .github/scripts/matrix.py + + build-and-test: + # Setting name to arch-compiler here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: "${{ matrix.arch }}-${{ matrix.toolchain.fullname }}" + uses: ./.github/workflows/kernel-build-test.yml + needs: [set-matrix] + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.set-matrix.outputs.build-matrix) }} + with: + arch: ${{ matrix.arch }} + toolchain_full: ${{ matrix.toolchain.fullname }} + toolchain: ${{ matrix.toolchain.name }} + runs_on: ${{ toJSON(matrix.runs_on) }} + build_runs_on: ${{ toJSON(matrix.build_runs_on) }} + llvm-version: ${{ matrix.toolchain.version }} + kernel: ${{ matrix.kernel }} + tests: ${{ toJSON(matrix.tests) }} + run_veristat: ${{ matrix.run_veristat }} + # We only run tests on pull requests. + run_tests: ${{ github.event_name != 'push' }} + # Download sources + download_sources: ${{ github.repository == 'kernel-patches/vmtest' }} + build_release: ${{ matrix.build_release }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} diff --git a/README b/README index fd903645e6de..e69de29bb2d1 100644 --- a/README +++ b/README @@ -1,18 +0,0 @@ -Linux kernel -============ - -There are several guides for kernel developers and users. These guides can -be rendered in a number of formats, like HTML and PDF. Please read -Documentation/admin-guide/README.rst first. - -In order to build the documentation, use ``make htmldocs`` or -``make pdfdocs``. 
The formatted documentation can also be read online at: - - https://www.kernel.org/doc/html/latest/ - -There are various text files in the Documentation/ subdirectory, -several of them using the reStructuredText markup notation. - -Please read the Documentation/process/changes.rst file, as it contains the -requirements for building and running the kernel, and information about -the problems which may result by upgrading your kernel. diff --git a/ci/diffs/.keep b/ci/diffs/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch new file mode 100644 index 000000000000..3b6139225e7b --- /dev/null +++ b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch @@ -0,0 +1,33 @@ +From 5440a12ac8fb2a8e051c597fcf5d85b427fe612a Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 13 Oct 2023 12:44:34 -0700 +Subject: [PATCH] Revert "bpf: Avoid unnecessary audit log for CPU security + mitigations" + +This reverts commit 236334aeec0f93217cf9235f2004e61a0a1a5985. 
+--- + include/linux/bpf.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index f0891ba24cb1..61bde4520f5c 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void) + + static inline bool bpf_bypass_spec_v1(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + static inline bool bpf_bypass_spec_v4(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + int bpf_map_new_fd(struct bpf_map *map, int flags); +-- +2.34.1 + diff --git a/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch new file mode 100644 index 000000000000..63bdd28adedd --- /dev/null +++ b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch @@ -0,0 +1,69 @@ +From c71766e8ff7a7f950522d25896fba758585500df Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 22 Apr 2024 21:14:40 -0700 +Subject: [PATCH] arch/Kconfig: Move SPECULATION_MITIGATIONS to arch/Kconfig + +SPECULATION_MITIGATIONS is currently defined only for x86. As a result, +IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) is always false for other +archs. f337a6a21e2f effectively set "mitigations=off" by default on +non-x86 archs, which is not desired behavior. Jakub observed this +change when running bpf selftests on s390 and arm64. 
+ +Fix this by moving SPECULATION_MITIGATIONS to arch/Kconfig so that it is +available in all archs and thus can be used safely in kernel/cpu.c + +Fixes: f337a6a21e2f ("x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n") +Cc: stable@vger.kernel.org +Cc: Sean Christopherson +Cc: Ingo Molnar +Cc: Daniel Sneddon +Cc: Jakub Kicinski +Signed-off-by: Song Liu +--- + arch/Kconfig | 10 ++++++++++ + arch/x86/Kconfig | 10 ---------- + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 9f066785bb71..8f4af75005f8 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1609,4 +1609,14 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT + # strict alignment always, even with -falign-functions. + def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG + ++menuconfig SPECULATION_MITIGATIONS ++ bool "Mitigations for speculative execution vulnerabilities" ++ default y ++ help ++ Say Y here to enable options which enable mitigations for ++ speculative execution hardware vulnerabilities. ++ ++ If you say N, all mitigations will be disabled. You really ++ should know what you are doing to say so. ++ + endmenu +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 39886bab943a..50c890fce5e0 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2486,16 +2486,6 @@ config PREFIX_SYMBOLS + def_bool y + depends on CALL_PADDING && !CFI_CLANG + +-menuconfig SPECULATION_MITIGATIONS +- bool "Mitigations for speculative execution vulnerabilities" +- default y +- help +- Say Y here to enable options which enable mitigations for +- speculative execution hardware vulnerabilities. +- +- If you say N, all mitigations will be disabled. You really +- should know what you are doing to say so. 
+- + if SPECULATION_MITIGATIONS + + config MITIGATION_PAGE_TABLE_ISOLATION +-- +2.43.0 + diff --git a/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch new file mode 100644 index 000000000000..a13d76719741 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch @@ -0,0 +1,94 @@ +From fb9a697860acd8f54f2ba6647923794378eb33da Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Sun, 26 Nov 2023 21:03:42 -0800 +Subject: [PATCH] bpf: Fix a few selftest failures due to llvm18 change + +With latest upstream llvm18, the following test cases failed: + + $ ./test_progs -j + #13/2 bpf_cookie/multi_kprobe_link_api:FAIL + #13/3 bpf_cookie/multi_kprobe_attach_api:FAIL + #13 bpf_cookie:FAIL + #77 fentry_fexit:FAIL + #78/1 fentry_test/fentry:FAIL + #78 fentry_test:FAIL + #82/1 fexit_test/fexit:FAIL + #82 fexit_test:FAIL + #112/1 kprobe_multi_test/skel_api:FAIL + #112/2 kprobe_multi_test/link_api_addrs:FAIL + [...] + #112 kprobe_multi_test:FAIL + #356/17 test_global_funcs/global_func17:FAIL + #356 test_global_funcs:FAIL + +Further analysis shows llvm upstream patch [1] is responsible for the above +failures. For example, for function bpf_fentry_test7() in net/bpf/test_run.c, +without [1], the asm code is: + + 0000000000000400 : + 400: f3 0f 1e fa endbr64 + 404: e8 00 00 00 00 callq 0x409 + 409: 48 89 f8 movq %rdi, %rax + 40c: c3 retq + 40d: 0f 1f 00 nopl (%rax) + +... and with [1], the asm code is: + + 0000000000005d20 : + 5d20: e8 00 00 00 00 callq 0x5d25 + 5d25: c3 retq + +... and is called instead of +and this caused test failures for #13/#77 etc. except #356. + +For test case #356/17, with [1] (progs/test_global_func17.c)), the main prog +looks like: + + 0000000000000000 : + 0: b4 00 00 00 2a 00 00 00 w0 = 0x2a + 1: 95 00 00 00 00 00 00 00 exit + +... which passed verification while the test itself expects a verification +failure. 
+ +Let us add 'barrier_var' style asm code in both places to prevent function +specialization which caused selftests failure. + + [1] https://github.com/llvm/llvm-project/pull/72903 + +Signed-off-by: Yonghong Song +Signed-off-by: Daniel Borkmann +Link: https://lore.kernel.org/bpf/20231127050342.1945270-1-yonghong.song@linux.dev +--- + net/bpf/test_run.c | 2 +- + tools/testing/selftests/bpf/progs/test_global_func17.c | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c +index c9fdcc5cdce1..711cf5d59816 100644 +--- a/net/bpf/test_run.c ++++ b/net/bpf/test_run.c +@@ -542,7 +542,7 @@ struct bpf_fentry_test_t { + + int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) + { +- asm volatile (""); ++ asm volatile ("": "+r"(arg)); + return (long)arg; + } + +diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c +index a32e11c7d933..5de44b09e8ec 100644 +--- a/tools/testing/selftests/bpf/progs/test_global_func17.c ++++ b/tools/testing/selftests/bpf/progs/test_global_func17.c +@@ -5,6 +5,7 @@ + + __noinline int foo(int *p) + { ++ barrier_var(p); + return p ? (*p = 42) : 0; + } + +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch new file mode 100644 index 000000000000..5832a4266470 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch @@ -0,0 +1,67 @@ +From dfce9cb3140592b886838e06f3e0c25fea2a9cae Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Thu, 30 Nov 2023 18:46:40 -0800 +Subject: [PATCH 1/1] bpf: Fix a verifier bug due to incorrect branch offset + comparison with cpu=v4 + +Bpf cpu=v4 support is introduced in [1] and Commit 4cd58e9af8b9 +("bpf: Support new 32bit offset jmp instruction") added support for new +32bit offset jmp instruction. 
Unfortunately, in function +bpf_adj_delta_to_off(), for new branch insn with 32bit offset, the offset +(plus/minor a small delta) compares to 16-bit offset bound +[S16_MIN, S16_MAX], which caused the following verification failure: + $ ./test_progs-cpuv4 -t verif_scale_pyperf180 + ... + insn 10 cannot be patched due to 16-bit range + ... + libbpf: failed to load object 'pyperf180.bpf.o' + scale_test:FAIL:expect_success unexpected error: -12 (errno 12) + #405 verif_scale_pyperf180:FAIL + +Note that due to recent llvm18 development, the patch [2] (already applied +in bpf-next) needs to be applied to bpf tree for testing purpose. + +The fix is rather simple. For 32bit offset branch insn, the adjusted +offset compares to [S32_MIN, S32_MAX] and then verification succeeded. + + [1] https://lore.kernel.org/all/20230728011143.3710005-1-yonghong.song@linux.dev + [2] https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev + +Fixes: 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction") +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/20231201024640.3417057-1-yonghong.song@linux.dev +--- + kernel/bpf/core.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index cd3afe57ece3..fe254ae035fe 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, + static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, s32 curr, const bool probe_pass) + { +- const s32 off_min = S16_MIN, off_max = S16_MAX; ++ s64 off_min, off_max, off; + s32 delta = end_new - end_old; +- s32 off; + +- if (insn->code == (BPF_JMP32 | BPF_JA)) ++ if (insn->code == (BPF_JMP32 | BPF_JA)) { + off = insn->imm; +- else ++ off_min = S32_MIN; ++ off_max = S32_MAX; ++ } else { + off = insn->off; ++ off_min = S16_MIN; ++ off_max = S16_MAX; ++ 
} + + if (curr < pos && curr + off + 1 >= end_old) + off += delta; +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch new file mode 100644 index 000000000000..ea6b2386d034 --- /dev/null +++ b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch @@ -0,0 +1,40 @@ +From patchwork Fri Aug 2 18:54:34 2024 +From: Yonghong Song +Subject: [PATCH bpf-next] selftests/bpf: Fix a btf_dump selftest failure + +Jakub reported bpf selftest "btf_dump" failure after forwarding to +v6.11-rc1 with netdev. + Error: #33 btf_dump + Error: #33/15 btf_dump/btf_dump: var_data + btf_dump_data:FAIL:find type id unexpected find type id: actual -2 < expected 0 + +The reason for the failure is due to + commit 94ede2a3e913 ("profiling: remove stale percpu flip buffer variables") +where percpu static variable "cpu_profile_flip" is removed. + +Let us replace "cpu_profile_flip" with a variable in bpf subsystem +so whenever that variable gets deleted or renamed, we can detect the +failure immediately. In this case, I picked a static percpu variable +"bpf_cgrp_storage_busy" which is defined in kernel/bpf/bpf_cgrp_storage.c. 
+ +Reported-by: Jakub Kicinski +Signed-off-by: Yonghong Song +--- + tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +index 09a8e6f9b379..b293b8501fd6 100644 +--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c ++++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +@@ -805,8 +805,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, + "int cpu_number = (int)100", 100); + #endif +- TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, +- "static int cpu_profile_flip = (int)2", 2); ++ TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, ++ "static int bpf_cgrp_storage_busy = (int)2", 2); + } + + static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, diff --git a/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch new file mode 100644 index 000000000000..bd12bd9b3fba --- /dev/null +++ b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch @@ -0,0 +1,99 @@ +From c8268f8e9fa33c32e1f2f86fc7b703408a396c70 Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 27 Oct 2023 11:24:24 -0700 +Subject: [PATCH] net: bpf: Use sockopt_lock_sock() in ip_sock_set_tos() + +With latest sync from net-next tree, bpf-next has a bpf selftest failure: + [root@arch-fb-vm1 bpf]# ./test_progs -t setget_sockopt + ... 
+ [ 76.194349] ============================================ + [ 76.194682] WARNING: possible recursive locking detected + [ 76.195039] 6.6.0-rc7-g37884503df08-dirty #67 Tainted: G W OE + [ 76.195518] -------------------------------------------- + [ 76.195852] new_name/154 is trying to acquire lock: + [ 76.196159] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: ip_sock_set_tos+0x19/0x30 + [ 76.196669] + [ 76.196669] but task is already holding lock: + [ 76.197028] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.197517] + [ 76.197517] other info that might help us debug this: + [ 76.197919] Possible unsafe locking scenario: + [ 76.197919] + [ 76.198287] CPU0 + [ 76.198444] ---- + [ 76.198600] lock(sk_lock-AF_INET); + [ 76.198831] lock(sk_lock-AF_INET); + [ 76.199062] + [ 76.199062] *** DEADLOCK *** + [ 76.199062] + [ 76.199420] May be due to missing lock nesting notation + [ 76.199420] + [ 76.199879] 2 locks held by new_name/154: + [ 76.200131] #0: ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.200644] #1: ffffffff90f96a40 (rcu_read_lock){....}-{1:2}, at: __cgroup_bpf_run_filter_sock_ops+0x55/0x290 + [ 76.201268] + [ 76.201268] stack backtrace: + [ 76.201538] CPU: 4 PID: 154 Comm: new_name Tainted: G W OE 6.6.0-rc7-g37884503df08-dirty #67 + [ 76.202134] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 + [ 76.202699] Call Trace: + [ 76.202858] + [ 76.203002] dump_stack_lvl+0x4b/0x80 + [ 76.203239] __lock_acquire+0x740/0x1ec0 + [ 76.203503] lock_acquire+0xc1/0x2a0 + [ 76.203766] ? ip_sock_set_tos+0x19/0x30 + [ 76.204050] ? sk_stream_write_space+0x12a/0x230 + [ 76.204389] ? lock_release+0xbe/0x260 + [ 76.204661] lock_sock_nested+0x32/0x80 + [ 76.204942] ? 
ip_sock_set_tos+0x19/0x30 + [ 76.205208] ip_sock_set_tos+0x19/0x30 + [ 76.205452] do_ip_setsockopt+0x4b3/0x1580 + [ 76.205719] __bpf_setsockopt+0x62/0xa0 + [ 76.205963] bpf_sock_ops_setsockopt+0x11/0x20 + [ 76.206247] bpf_prog_630217292049c96e_bpf_test_sockopt_int+0xbc/0x123 + [ 76.206660] bpf_prog_493685a3bae00bbd_bpf_test_ip_sockopt+0x49/0x4b + [ 76.207055] bpf_prog_b0bcd27f269aeea0_skops_sockopt+0x44c/0xec7 + [ 76.207437] __cgroup_bpf_run_filter_sock_ops+0xda/0x290 + [ 76.207829] __inet_listen_sk+0x108/0x1b0 + [ 76.208122] inet_listen+0x48/0x70 + [ 76.208373] __sys_listen+0x74/0xb0 + [ 76.208630] __x64_sys_listen+0x16/0x20 + [ 76.208911] do_syscall_64+0x3f/0x90 + [ 76.209174] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 + ... + +Both ip_sock_set_tos() and inet_listen() calls lock_sock(sk) which +caused a dead lock. + +To fix the issue, use sockopt_lock_sock() in ip_sock_set_tos() +instead. sockopt_lock_sock() will avoid lock_sock() if it is in bpf +context. + +Fixes: 878d951c6712 ("inet: lock the socket in ip_sock_set_tos()") +Suggested-by: Martin KaFai Lau +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/bpf/20231027182424.1444845-1-yonghong.song@linux.dev +--- + net/ipv4/ip_sockglue.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c +index 9c68b6b74d9f..2efc53526a38 100644 +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -602,9 +602,9 @@ void __ip_sock_set_tos(struct sock *sk, int val) + + void ip_sock_set_tos(struct sock *sk, int val) + { +- lock_sock(sk); ++ sockopt_lock_sock(sk); + __ip_sock_set_tos(sk, val); +- release_sock(sk); ++ sockopt_release_sock(sk); + } + EXPORT_SYMBOL(ip_sock_set_tos); + +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch new file mode 100644 index 
000000000000..da5bcdc45596 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch @@ -0,0 +1,51 @@ +From 41c24102af7b6236277a214428b203d51a3462df Mon Sep 17 00:00:00 2001 +From: Stanislav Fomichev +Date: Thu, 25 Jul 2024 14:40:29 -0700 +Subject: [PATCH 1/1] selftests/bpf: Filter out _GNU_SOURCE when compiling + test_cpp + +Jakub reports build failures when merging linux/master with net tree: + +CXX test_cpp +In file included from :454: +:2:9: error: '_GNU_SOURCE' macro redefined [-Werror,-Wmacro-redefined] + 2 | #define _GNU_SOURCE + | ^ +:445:9: note: previous definition is here + 445 | #define _GNU_SOURCE 1 + +The culprit is commit cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to +CFLAGS in lib.mk") which unconditionally added -D_GNU_SOUCE to CLFAGS. +Apparently clang++ also unconditionally adds it for the C++ targets [0] +which causes a conflict. Add small change in the selftests makefile +to filter it out for test_cpp. + +Not sure which tree it should go via, targeting bpf for now, but net +might be better? + +0: https://stackoverflow.com/questions/11670581/why-is-gnu-source-defined-by-default-and-how-to-turn-it-off + +Signed-off-by: Stanislav Fomichev +Signed-off-by: Andrii Nakryiko +Acked-by: Jiri Olsa +Link: https://lore.kernel.org/bpf/20240725214029.1760809-1-sdf@fomichev.me +--- + tools/testing/selftests/bpf/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index dd49c1d23a60..81d4757ecd4c 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + # Make sure we are able to include and link libbpf against c++. 
+ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch new file mode 100644 index 000000000000..4ebfe20b2470 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch @@ -0,0 +1,50 @@ +From f3d2080e8cf23125f79e345061149ae40f66816f Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 3 Jun 2024 23:43:17 -0700 +Subject: [PATCH bpf-next] selftests/bpf: Fix bpf_cookie and find_vma in nested + VM + +bpf_cookie and find_vma are flaky in nested VMs, which is used by some CI +systems. It turns out these failures are caused by unreliable perf event +in nested VM. Fix these by: + + 1. Use PERF_COUNT_SW_CPU_CLOCK in find_vma; + 2. Increase sample_freq in bpf_cookie. 
+ +Signed-off-by: Song Liu +--- + tools/testing/selftests/bpf/prog_tests/bpf_cookie.c | 2 +- + tools/testing/selftests/bpf/prog_tests/find_vma.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +index 4407ea428e77..070c52c312e5 100644 +--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c ++++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +@@ -451,7 +451,7 @@ static void pe_subtest(struct test_bpf_cookie *skel) + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; +- attr.sample_freq = 1000; ++ attr.sample_freq = 10000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; +diff --git a/tools/testing/selftests/bpf/prog_tests/find_vma.c b/tools/testing/selftests/bpf/prog_tests/find_vma.c +index 5165b38f0e59..f7619e0ade10 100644 +--- a/tools/testing/selftests/bpf/prog_tests/find_vma.c ++++ b/tools/testing/selftests/bpf/prog_tests/find_vma.c +@@ -29,8 +29,8 @@ static int open_pe(void) + + /* create perf event */ + attr.size = sizeof(attr); +- attr.type = PERF_TYPE_HARDWARE; +- attr.config = PERF_COUNT_HW_CPU_CYCLES; ++ attr.type = PERF_TYPE_SOFTWARE; ++ attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 1000; + pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch new file mode 100644 index 000000000000..d55d2e7af865 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch @@ -0,0 +1,78 @@ +From 100888fb6d8a185866b1520031ee7e3182b173de Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 10 Nov 2023 11:36:44 -0800 +Subject: [PATCH] selftests/bpf: Fix 
pyperf180 compilation failure with clang18 + +With latest clang18 (main branch of llvm-project repo), when building bpf selftests, + [~/work/bpf-next (master)]$ make -C tools/testing/selftests/bpf LLVM=1 -j + +The following compilation error happens: + fatal error: error in backend: Branch target out of insn range + ... + Stack dump: + 0. Program arguments: clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -I/home/yhs/work/bpf-next/tools/include/uapi + -I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include -idirafter + /home/yhs/work/llvm-project/llvm/build.18/install/lib/clang/18/include -idirafter /usr/local/include + -idirafter /usr/include -Wno-compare-distinct-pointer-types -DENABLE_ATOMICS_TESTS -O2 --target=bpf + -c progs/pyperf180.c -mcpu=v3 -o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/pyperf180.bpf.o + 1. parser at end of file + 2. Code generation + ... + +The compilation failure only happens to cpu=v2 and cpu=v3. cpu=v4 is okay +since cpu=v4 supports 32-bit branch target offset. + +The above failure is due to upstream llvm patch [1] where some inlining behavior +are changed in clang18. + +To workaround the issue, previously all 180 loop iterations are fully unrolled. +The bpf macro __BPF_CPU_VERSION__ (implemented in clang18 recently) is used to avoid +unrolling changes if cpu=v4. If __BPF_CPU_VERSION__ is not available and the +compiler is clang18, the unrollng amount is unconditionally reduced. 
+ + [1] https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e + +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Tested-by: Alan Maguire +Link: https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev +--- + tools/testing/selftests/bpf/progs/pyperf180.c | 22 +++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/tools/testing/selftests/bpf/progs/pyperf180.c b/tools/testing/selftests/bpf/progs/pyperf180.c +index c39f559d3100..42c4a8b62e36 100644 +--- a/tools/testing/selftests/bpf/progs/pyperf180.c ++++ b/tools/testing/selftests/bpf/progs/pyperf180.c +@@ -1,4 +1,26 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (c) 2019 Facebook + #define STACK_MAX_LEN 180 ++ ++/* llvm upstream commit at clang18 ++ * https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e ++ * changed inlining behavior and caused compilation failure as some branch ++ * target distance exceeded 16bit representation which is the maximum for ++ * cpu v1/v2/v3. Macro __BPF_CPU_VERSION__ is later implemented in clang18 ++ * to specify which cpu version is used for compilation. So a smaller ++ * unroll_count can be set if __BPF_CPU_VERSION__ is less than 4, which ++ * reduced some branch target distances and resolved the compilation failure. ++ * ++ * To capture the case where a developer/ci uses clang18 but the corresponding ++ * repo checkpoint does not have __BPF_CPU_VERSION__, a smaller unroll_count ++ * will be set as well to prevent potential compilation failures. 
++ */ ++#ifdef __BPF_CPU_VERSION__ ++#if __BPF_CPU_VERSION__ < 4 ++#define UNROLL_COUNT 90 ++#endif ++#elif __clang_major__ == 18 ++#define UNROLL_COUNT 90 ++#endif ++ + #include "pyperf.h" +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch new file mode 100644 index 000000000000..6497a6cc38c9 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch @@ -0,0 +1,41 @@ +From 42839864a62ee244ec280b09149b1cb439f681db Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Fri, 27 Oct 2023 18:25:39 -0700 +Subject: [PATCH bpf-next] selftests/bpf: disable detection of llvm when + building bpftool + +The VMs in which we run the selftests do not have llvm installed. +We build selftests/bpftool in a host that have llvm. +bpftool currently will use llvm first and fallback to libbfd but there +is no way to disable detection from the command line. + +Removing it from the feature detection should force us to use libbfd. 
+ +Signed-off-by: Manu Bretelle +--- + tools/bpf/bpftool/Makefile | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile +index e9154ace80ff..01314458e25e 100644 +--- a/tools/bpf/bpftool/Makefile ++++ b/tools/bpf/bpftool/Makefile +@@ -95,7 +95,6 @@ RM ?= rm -f + FEATURE_USER = .bpftool + + FEATURE_TESTS := clang-bpf-co-re +-FEATURE_TESTS += llvm + FEATURE_TESTS += libcap + FEATURE_TESTS += libbfd + FEATURE_TESTS += libbfd-liberty +@@ -104,7 +103,6 @@ FEATURE_TESTS += disassembler-four-args + FEATURE_TESTS += disassembler-init-styled + + FEATURE_DISPLAY := clang-bpf-co-re +-FEATURE_DISPLAY += llvm + FEATURE_DISPLAY += libcap + FEATURE_DISPLAY += libbfd + FEATURE_DISPLAY += libbfd-liberty +-- +2.39.3 + diff --git a/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch new file mode 100644 index 000000000000..3fa007c51db6 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch @@ -0,0 +1,32 @@ +From 0daad0a615e687e1247230f3d0c31ae60ba32314 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Tue, 28 May 2024 15:29:38 -0700 +Subject: [PATCH bpf-next] selftests/bpf: fix inet_csk_accept prototype in + test_sk_storage_tracing.c + +Recent kernel change ([0]) changed inet_csk_accept() prototype. Adapt +progs/test_sk_storage_tracing.c to take that into account. 
+ + [0] 92ef0fd55ac8 ("net: change proto and proto_ops accept type") + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +index 02e718f06e0f..40531e56776e 100644 +--- a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c ++++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +@@ -84,7 +84,7 @@ int BPF_PROG(trace_tcp_connect, struct sock *sk) + } + + SEC("fexit/inet_csk_accept") +-int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, int *err, bool kern, ++int BPF_PROG(inet_csk_accept, struct sock *sk, struct proto_accept_arg *arg, + struct sock *accepted_sk) + { + set_task_info(accepted_sk); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch new file mode 100644 index 000000000000..ec1e29a8ab97 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch @@ -0,0 +1,31 @@ +From d31a7125891994681503770cff46a119692fb2b9 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Mon, 11 Dec 2023 17:09:38 -0800 +Subject: [PATCH 1/1] selftests/bpf: work around latest Clang smartness + +Work around the issue while we deal with it in the Clang itself. +See [0]. 
+ + [0] https://github.com/llvm/llvm-project/pull/73662#issuecomment-1849281758 + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/iters.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c +index 3aca3dc145b5..929ba6fa2105 100644 +--- a/tools/testing/selftests/bpf/progs/iters.c ++++ b/tools/testing/selftests/bpf/progs/iters.c +@@ -1420,7 +1420,7 @@ SEC("raw_tp") + __success + int iter_arr_with_actual_elem_count(const void *ctx) + { +- int i, n = loop_data.n, sum = 0; ++ unsigned i, n = loop_data.n, sum = 0; + + if (n > ARRAY_SIZE(loop_data.data)) + return 0; +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch new file mode 100644 index 000000000000..e631fac0cc69 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch @@ -0,0 +1,89 @@ +From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 +From: Anders Roxell +Date: Thu, 9 Nov 2023 18:43:28 +0100 +Subject: [PATCH] selftests: bpf: xskxceiver: ksft_print_msg: fix format type + error + +Crossbuilding selftests/bpf for architecture arm64, format specifies +type error show up like. 
+ +xskxceiver.c:912:34: error: format specifies type 'int' but the argument +has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", + ~~ + %llu + __func__, pkt->pkt_nb, meta->count); + ^~~~~~~~~~~ +xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but + the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] + ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ~~~~ ^~~~ + +Fixing the issues by casting to (unsigned long long) and changing the +specifiers to be %llu from %d and %u, since with u64s it might be %llx +or %lx, depending on architecture. + +Signed-off-by: Anders Roxell +Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++------- + 1 file changed, 12 insertions(+), 7 deletions(-) + +diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c +index 591ca9637b23..b604c570309a 100644 +--- a/tools/testing/selftests/bpf/xskxceiver.c ++++ b/tools/testing/selftests/bpf/xskxceiver.c +@@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) + struct xdp_info *meta = data - sizeof(struct xdp_info); + + if (meta->count != pkt->pkt_nb) { +- ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", +- __func__, pkt->pkt_nb, meta->count); ++ ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", ++ __func__, pkt->pkt_nb, ++ (unsigned long long)meta->count); + return false; + } + +@@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp + + if (addr >= umem->num_frames * umem->frame_size || + addr + len > umem->num_frames * umem->frame_size) { +- ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag invalid addr: %llx len: %u\n", ++ 
(unsigned long long)addr, len); + return false; + } + if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { +- ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + +@@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) + u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); + + ksft_print_msg("[%s] Too many packets completed\n", __func__); +- ksft_print_msg("Last completion address: %llx\n", addr); ++ ksft_print_msg("Last completion address: %llx\n", ++ (unsigned long long)addr); + return TEST_FAILURE; + } + +@@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) + } + + if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { +- ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", +- __func__, stats.tx_invalid_descs, ++ ksft_print_msg("[%s] tx_invalid_descs incorrect. 
Got [%llu] expected [%u]\n", ++ __func__, ++ (unsigned long long)stats.tx_invalid_descs, + ifobject->xsk->pkt_stream->nb_pkts); + return TEST_FAILURE; + } +-- +2.34.1 + diff --git a/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch new file mode 100644 index 000000000000..19d269de7e8c --- /dev/null +++ b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch @@ -0,0 +1,142 @@ +From 3772e6cdb51f21a11df2acf6aa431cc8b9137bfb Mon Sep 17 00:00:00 2001 +From: Viktor Malik +Date: Tue, 6 Feb 2024 13:46:09 +0100 +Subject: [PATCH 1/2] tools/resolve_btfids: Refactor set sorting with types + from btf_ids.h + +Instead of using magic offsets to access BTF ID set data, leverage types +from btf_ids.h (btf_id_set and btf_id_set8) which define the actual +layout of the data. Thanks to this change, set sorting should also +continue working if the layout changes. + +This requires to sync the definition of 'struct btf_id_set8' from +include/linux/btf_ids.h to tools/include/linux/btf_ids.h. We don't sync +the rest of the file at the moment, b/c that would require to also sync +multiple dependent headers and we don't need any other defs from +btf_ids.h. 
+ +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Acked-by: Daniel Xu +Link: https://lore.kernel.org/bpf/ff7f062ddf6a00815fda3087957c4ce667f50532.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 ++++++++++++++++++++------------- + tools/include/linux/btf_ids.h | 9 +++++++++ + 2 files changed, 30 insertions(+), 14 deletions(-) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 27a23196d58e..32634f00abba 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -70,6 +70,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -78,7 +79,7 @@ + #include + + #define BTF_IDS_SECTION ".BTF_ids" +-#define BTF_ID "__BTF_ID__" ++#define BTF_ID_PREFIX "__BTF_ID__" + + #define BTF_STRUCT "struct" + #define BTF_UNION "union" +@@ -161,7 +162,7 @@ static int eprintf(int level, int var, const char *fmt, ...) + + static bool is_btf_id(const char *name) + { +- return name && !strncmp(name, BTF_ID, sizeof(BTF_ID) - 1); ++ return name && !strncmp(name, BTF_ID_PREFIX, sizeof(BTF_ID_PREFIX) - 1); + } + + static struct btf_id *btf_id__find(struct rb_root *root, const char *name) +@@ -441,7 +442,7 @@ static int symbols_collect(struct object *obj) + * __BTF_ID__TYPE__vfs_truncate__0 + * prefix = ^ + */ +- prefix = name + sizeof(BTF_ID) - 1; ++ prefix = name + sizeof(BTF_ID_PREFIX) - 1; + + /* struct */ + if (!strncmp(prefix, BTF_STRUCT, sizeof(BTF_STRUCT) - 1)) { +@@ -649,19 +650,18 @@ static int cmp_id(const void *pa, const void *pb) + static int sets_patch(struct object *obj) + { + Elf_Data *data = obj->efile.idlist; +- int *ptr = data->d_buf; + struct rb_node *next; + + next = rb_first(&obj->sets); + while (next) { +- unsigned long addr, idx; ++ struct btf_id_set8 *set8; ++ struct btf_id_set *set; ++ unsigned long addr, off; + struct btf_id *id; +- int *base; +- int cnt; + + id = rb_entry(next, struct btf_id, rb_node); + addr = id->addr[0]; +- idx 
= addr - obj->efile.idlist_addr; ++ off = addr - obj->efile.idlist_addr; + + /* sets are unique */ + if (id->addr_cnt != 1) { +@@ -670,14 +670,21 @@ static int sets_patch(struct object *obj) + return -1; + } + +- idx = idx / sizeof(int); +- base = &ptr[idx] + (id->is_set8 ? 2 : 1); +- cnt = ptr[idx]; ++ if (id->is_set) { ++ set = data->d_buf + off; ++ qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id); ++ } else { ++ set8 = data->d_buf + off; ++ /* ++ * Make sure id is at the beginning of the pairs ++ * struct, otherwise the below qsort would not work. ++ */ ++ BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); ++ qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ } + + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", +- (idx + 1) * sizeof(int), cnt, id->name); +- +- qsort(base, cnt, id->is_set8 ? sizeof(uint64_t) : sizeof(int), cmp_id); ++ off, id->is_set ? set->cnt : set8->cnt, id->name); + + next = rb_next(next); + } +diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h +index 2f882d5cb30f..72535f00572f 100644 +--- a/tools/include/linux/btf_ids.h ++++ b/tools/include/linux/btf_ids.h +@@ -8,6 +8,15 @@ struct btf_id_set { + u32 ids[]; + }; + ++struct btf_id_set8 { ++ u32 cnt; ++ u32 flags; ++ struct { ++ u32 id; ++ u32 flags; ++ } pairs[]; ++}; ++ + #ifdef CONFIG_DEBUG_INFO_BTF + + #include /* for __PASTE */ +-- +2.39.3 + + + diff --git a/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch new file mode 100644 index 000000000000..24ebc231056c --- /dev/null +++ b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch @@ -0,0 +1,65 @@ +From 08969a676d234a178ff9f8c67936a2ad98a741eb Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 27 Oct 2023 16:22:24 -0700 +Subject: [PATCH] tracing/kprobes: Fix symbol counting logic by looking at + modules as well + +Recent changes to count number of matching symbols when 
creating +a kprobe event failed to take into account kernel modules. As such, it +breaks kprobes on kernel module symbols, by assuming there is no match. + +Fix this my calling module_kallsyms_on_each_symbol() in addition to +kallsyms_on_each_match_symbol() to perform a proper counting. + +Cc: Francis Laniel +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Steven Rostedt +Fixes: b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols") +Signed-off-by: Andrii Nakryiko +--- + kernel/trace/trace_kprobe.c | 24 ++++++++++++++++++++---- + 1 file changed, 20 insertions(+), 4 deletions(-) + +diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c +index effcaede4759..1efb27f35963 100644 +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -714,14 +714,30 @@ static int count_symbols(void *data, unsigned long unused) + return 0; + } + ++struct sym_count_ctx { ++ unsigned int count; ++ const char *name; ++}; ++ ++static int count_mod_symbols(void *data, const char *name, unsigned long unused) ++{ ++ struct sym_count_ctx *ctx = data; ++ ++ if (strcmp(name, ctx->name) == 0) ++ ctx->count++; ++ ++ return 0; ++} ++ + static unsigned int number_of_same_symbols(char *func_name) + { +- unsigned int count; ++ struct sym_count_ctx ctx = { .count = 0, .name = func_name }; ++ ++ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + +- count = 0; +- kallsyms_on_each_match_symbol(count_symbols, func_name, &count); ++ module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + +- return count; ++ return ctx.count; + } + + static int __trace_kprobe_create(int argc, const char *argv[]) +-- +2.34.1 + diff --git a/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch new file mode 100644 index 000000000000..c4d67693bd13 --- /dev/null +++ 
b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch @@ -0,0 +1,117 @@ +From c3dcadfdf2bf8f01471066700c098b5185240df6 Mon Sep 17 00:00:00 2001 +From: Viktor Malik +Date: Tue, 6 Feb 2024 13:46:10 +0100 +Subject: [PATCH 2/2] tools/resolve_btfids: Fix cross-compilation to non-host + endianness + +The .BTF_ids section is pre-filled with zeroed BTF ID entries during the +build and afterwards patched by resolve_btfids with correct values. +Since resolve_btfids always writes in host-native endianness, it relies +on libelf to do the translation when the target ELF is cross-compiled to +a different endianness (this was introduced in commit 61e8aeda9398 +("bpf: Fix libelf endian handling in resolv_btfids")). + +Unfortunately, the translation will corrupt the flags fields of SET8 +entries because these were written during vmlinux compilation and are in +the correct endianness already. This will lead to numerous selftests +failures such as: + + $ sudo ./test_verifier 502 502 + #502/p sleepable fentry accept FAIL + Failed to load prog 'Invalid argument'! + bpf_fentry_test1 is not sleepable + verification time 34 usec + stack depth 0 + processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 + Summary: 0 PASSED, 0 SKIPPED, 1 FAILED + +Since it's not possible to instruct libelf to translate just certain +values, let's manually bswap the flags (both global and entry flags) in +resolve_btfids when needed, so that libelf then translates everything +correctly. 
+ +Fixes: ef2c6f370a63 ("tools/resolve_btfids: Add support for 8-byte BTF sets") +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/7b6bff690919555574ce0f13d2a5996cacf7bf69.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 +++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 32634f00abba..d9520cb826b3 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -90,6 +90,14 @@ + + #define ADDR_CNT 100 + ++#if __BYTE_ORDER == __LITTLE_ENDIAN ++# define ELFDATANATIVE ELFDATA2LSB ++#elif __BYTE_ORDER == __BIG_ENDIAN ++# define ELFDATANATIVE ELFDATA2MSB ++#else ++# error "Unknown machine endianness!" ++#endif ++ + struct btf_id { + struct rb_node rb_node; + char *name; +@@ -117,6 +125,7 @@ struct object { + int idlist_shndx; + size_t strtabidx; + unsigned long idlist_addr; ++ int encoding; + } efile; + + struct rb_root sets; +@@ -320,6 +329,7 @@ static int elf_collect(struct object *obj) + { + Elf_Scn *scn = NULL; + size_t shdrstrndx; ++ GElf_Ehdr ehdr; + int idx = 0; + Elf *elf; + int fd; +@@ -351,6 +361,13 @@ static int elf_collect(struct object *obj) + return -1; + } + ++ if (gelf_getehdr(obj->efile.elf, &ehdr) == NULL) { ++ pr_err("FAILED cannot get ELF header: %s\n", ++ elf_errmsg(-1)); ++ return -1; ++ } ++ obj->efile.encoding = ehdr.e_ident[EI_DATA]; ++ + /* + * Scan all the elf sections and look for save data + * from .BTF_ids section and symbols. +@@ -681,6 +698,24 @@ static int sets_patch(struct object *obj) + */ + BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); + qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ ++ /* ++ * When ELF endianness does not match endianness of the ++ * host, libelf will do the translation when updating ++ * the ELF. This, however, corrupts SET8 flags which are ++ * already in the target endianness. 
So, let's bswap ++ * them to the host endianness and libelf will then ++ * correctly translate everything. ++ */ ++ if (obj->efile.encoding != ELFDATANATIVE) { ++ int i; ++ ++ set8->flags = bswap_32(set8->flags); ++ for (i = 0; i < set8->cnt; i++) { ++ set8->pairs[i].flags = ++ bswap_32(set8->pairs[i].flags); ++ } ++ } + } + + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", +-- +2.39.3 + diff --git a/ci/diffs/0099-s390x_nolockdep.diff b/ci/diffs/0099-s390x_nolockdep.diff new file mode 100644 index 000000000000..44c2d1a52065 --- /dev/null +++ b/ci/diffs/0099-s390x_nolockdep.diff @@ -0,0 +1,48 @@ +From 470d0c7874ac638ea62cddc3a20ec047fa4ab539 Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Wed, 14 Feb 2024 17:25:35 -0800 +Subject: [PATCH] bpf/selftests: disable lockdep on s390x + +Tests are slow to run on s390x, this should make them faster. + +Signed-off-by: Manu Bretelle +--- + tools/testing/selftests/bpf/config.s390x | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x +index 706931a8c2c69..67bfd62b0b582 100644 +--- a/tools/testing/selftests/bpf/config.s390x ++++ b/tools/testing/selftests/bpf/config.s390x +@@ -23,11 +23,11 @@ CONFIG_CPUSETS=y + CONFIG_CRASH_DUMP=y + CONFIG_CRYPTO_USER_API_RNG=y + CONFIG_CRYPTO_USER_API_SKCIPHER=y +-CONFIG_DEBUG_ATOMIC_SLEEP=y ++CONFIG_DEBUG_ATOMIC_SLEEP=n + CONFIG_DEBUG_INFO_BTF=y + CONFIG_DEBUG_INFO_DWARF4=y + CONFIG_DEBUG_LIST=y +-CONFIG_DEBUG_LOCKDEP=y ++CONFIG_DEBUG_LOCKDEP=n + CONFIG_DEBUG_NOTIFIERS=y + CONFIG_DEBUG_PAGEALLOC=y + CONFIG_DEBUG_SECTION_MISMATCH=y +@@ -71,7 +71,7 @@ CONFIG_KRETPROBES=y + CONFIG_KSM=y + CONFIG_LATENCYTOP=y + CONFIG_LIVEPATCH=y +-CONFIG_LOCK_STAT=y ++CONFIG_LOCK_STAT=n + CONFIG_MACVLAN=y + CONFIG_MACVTAP=y + CONFIG_MAGIC_SYSRQ=y +@@ -101,7 +101,7 @@ CONFIG_PCI=y + CONFIG_POSIX_MQUEUE=y + CONFIG_PROC_KCORE=y + CONFIG_PROFILING=y +-CONFIG_PROVE_LOCKING=y ++CONFIG_PROVE_LOCKING=n + 
CONFIG_PTDUMP_DEBUGFS=y + CONFIG_RC_DEVICES=y + CONFIG_RC_LOOPBACK=y diff --git a/ci/diffs/0099-selftest-cross-compile.diff b/ci/diffs/0099-selftest-cross-compile.diff new file mode 100644 index 000000000000..e8732596bdb3 --- /dev/null +++ b/ci/diffs/0099-selftest-cross-compile.diff @@ -0,0 +1,13 @@ +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index a38a3001527c..af68528cc944 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -304,7 +304,7 @@ $(OUTPUT)/test_maps: $(TESTING_HELPERS) + $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) + $(OUTPUT)/xsk.o: $(BPFOBJ) + +-BPFTOOL ?= $(DEFAULT_BPFTOOL) ++BPFTOOL ?= $(TRUNNER_BPFTOOL) + $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ diff --git a/ci/diffs/0199-Revert-Merge-branch-netfs-writeback-of-ssh-gitolite..patch b/ci/diffs/0199-Revert-Merge-branch-netfs-writeback-of-ssh-gitolite..patch new file mode 100644 index 000000000000..6946dd02ecbe --- /dev/null +++ b/ci/diffs/0199-Revert-Merge-branch-netfs-writeback-of-ssh-gitolite..patch @@ -0,0 +1,7817 @@ +From e87dca596d357b694c929da2c1a9ee374ab02e66 Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Mon, 23 Sep 2024 12:04:48 -0700 +Subject: [PATCH] Revert "Merge branch 'netfs-writeback' of + ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs into + vfs.netfs" + +This reverts commit 3956e7284c41629eb8f1e7104f1e73332bd1ce97, reversing +changes made to 4356ab331c8f0dbed0f683abde345cd5503db1e4. 
+--- + fs/9p/vfs_addr.c | 11 +- + fs/afs/file.c | 30 +- + fs/afs/fsclient.c | 9 +- + fs/afs/write.c | 4 +- + fs/afs/yfsclient.c | 9 +- + fs/cachefiles/io.c | 19 +- + fs/cachefiles/xattr.c | 34 +- + fs/ceph/addr.c | 76 ++-- + fs/netfs/Makefile | 4 +- + fs/netfs/buffered_read.c | 766 +++++++++++++-------------------- + fs/netfs/buffered_write.c | 309 +++++++------- + fs/netfs/direct_read.c | 147 +------ + fs/netfs/internal.h | 43 +- + fs/netfs/io.c | 804 +++++++++++++++++++++++++++++++++++ + fs/netfs/iterator.c | 50 --- + fs/netfs/main.c | 7 +- + fs/netfs/misc.c | 94 ---- + fs/netfs/objects.c | 16 +- + fs/netfs/read_collect.c | 544 ------------------------ + fs/netfs/read_pgpriv2.c | 264 ------------ + fs/netfs/read_retry.c | 256 ----------- + fs/netfs/stats.c | 27 +- + fs/netfs/write_collect.c | 246 +++++++---- + fs/netfs/write_issue.c | 93 ++-- + fs/nfs/fscache.c | 19 +- + fs/nfs/fscache.h | 7 +- + fs/smb/client/cifsencrypt.c | 144 ++++++- + fs/smb/client/cifsglob.h | 4 +- + fs/smb/client/cifssmb.c | 7 +- + fs/smb/client/file.c | 96 +++-- + fs/smb/client/smb2ops.c | 219 +++++----- + fs/smb/client/smb2pdu.c | 25 +- + fs/smb/client/smbdirect.c | 82 ++-- + include/linux/folio_queue.h | 156 ------- + include/linux/iov_iter.h | 104 ----- + include/linux/netfs.h | 46 +- + include/linux/uio.h | 18 - + include/trace/events/netfs.h | 144 ++----- + lib/iov_iter.c | 240 +---------- + lib/kunit_iov_iter.c | 259 ----------- + lib/scatterlist.c | 69 +-- + 41 files changed, 1983 insertions(+), 3518 deletions(-) + create mode 100644 fs/netfs/io.c + delete mode 100644 fs/netfs/read_collect.c + delete mode 100644 fs/netfs/read_pgpriv2.c + delete mode 100644 fs/netfs/read_retry.c + delete mode 100644 include/linux/folio_queue.h + +diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c +index 819c75233235..24fdc74caeba 100644 +--- a/fs/9p/vfs_addr.c ++++ b/fs/9p/vfs_addr.c +@@ -68,22 +68,17 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) + { + struct netfs_io_request 
*rreq = subreq->rreq; + struct p9_fid *fid = rreq->netfs_priv; +- unsigned long long pos = subreq->start + subreq->transferred; + int total, err; + +- total = p9_client_read(fid, pos, &subreq->io_iter, &err); ++ total = p9_client_read(fid, subreq->start + subreq->transferred, ++ &subreq->io_iter, &err); + + /* if we just extended the file size, any portion not in + * cache won't be on server and is zeroes */ + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); +- if (pos + total >= i_size_read(rreq->inode)) +- __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); + +- if (!err) +- subreq->transferred += total; +- +- netfs_read_subreq_terminated(subreq, err, false); ++ netfs_subreq_terminated(subreq, err ?: total, false); + } + + /** +diff --git a/fs/afs/file.c b/fs/afs/file.c +index 492d857a3fa0..ec1be0091fdb 100644 +--- a/fs/afs/file.c ++++ b/fs/afs/file.c +@@ -16,7 +16,6 @@ + #include + #include + #include +-#include + #include "internal.h" + + static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); +@@ -243,10 +242,9 @@ static void afs_fetch_data_notify(struct afs_operation *op) + + req->error = error; + if (subreq) { +- subreq->rreq->i_size = req->file_size; +- if (req->pos + req->actual_len >= req->file_size) +- __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); +- netfs_read_subreq_terminated(subreq, error, false); ++ if (subreq->rreq->origin != NETFS_DIO_READ) ++ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); ++ netfs_subreq_terminated(subreq, error ?: req->actual_len, false); + req->subreq = NULL; + } else if (req->done) { + req->done(req); +@@ -264,12 +262,6 @@ static void afs_fetch_data_success(struct afs_operation *op) + afs_fetch_data_notify(op); + } + +-static void afs_fetch_data_aborted(struct afs_operation *op) +-{ +- afs_check_for_remote_deletion(op); +- afs_fetch_data_notify(op); +-} +- + static void afs_fetch_data_put(struct afs_operation *op) + { + op->fetch.req->error = afs_op_error(op); +@@ 
-280,7 +272,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = { + .issue_afs_rpc = afs_fs_fetch_data, + .issue_yfs_rpc = yfs_fs_fetch_data, + .success = afs_fetch_data_success, +- .aborted = afs_fetch_data_aborted, ++ .aborted = afs_check_for_remote_deletion, + .failed = afs_fetch_data_notify, + .put = afs_fetch_data_put, + }; +@@ -302,7 +294,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) + op = afs_alloc_operation(req->key, vnode->volume); + if (IS_ERR(op)) { + if (req->subreq) +- netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false); ++ netfs_subreq_terminated(req->subreq, PTR_ERR(op), false); + return PTR_ERR(op); + } + +@@ -313,15 +305,14 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) + return afs_do_sync_operation(op); + } + +-static void afs_read_worker(struct work_struct *work) ++static void afs_issue_read(struct netfs_io_subrequest *subreq) + { +- struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + struct afs_read *fsreq; + + fsreq = afs_alloc_read(GFP_NOFS); + if (!fsreq) +- return netfs_read_subreq_terminated(subreq, -ENOMEM, false); ++ return netfs_subreq_terminated(subreq, -ENOMEM, false); + + fsreq->subreq = subreq; + fsreq->pos = subreq->start + subreq->transferred; +@@ -330,17 +321,10 @@ static void afs_read_worker(struct work_struct *work) + fsreq->vnode = vnode; + fsreq->iter = &subreq->io_iter; + +- trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + afs_fetch_data(fsreq->vnode, fsreq); + afs_put_read(fsreq); + } + +-static void afs_issue_read(struct netfs_io_subrequest *subreq) +-{ +- INIT_WORK(&subreq->work, afs_read_worker); +- queue_work(system_long_wq, &subreq->work); +-} +- + static int afs_symlink_read_folio(struct file *file, struct folio *folio) + { + struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host); +diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c +index 
098fa034a1cc..79cd30775b7a 100644 +--- a/fs/afs/fsclient.c ++++ b/fs/afs/fsclient.c +@@ -304,7 +304,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) + struct afs_vnode_param *vp = &op->file[0]; + struct afs_read *req = op->fetch.req; + const __be32 *bp; +- size_t count_before; + int ret; + + _enter("{%u,%zu,%zu/%llu}", +@@ -346,14 +345,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) + + /* extract the returned data */ + case 2: +- count_before = call->iov_len; +- _debug("extract data %zu/%llu", count_before, req->actual_len); ++ _debug("extract data %zu/%llu", ++ iov_iter_count(call->iter), req->actual_len); + + ret = afs_extract_data(call, true); +- if (req->subreq) { +- req->subreq->transferred += count_before - call->iov_len; +- netfs_read_subreq_progress(req->subreq, false); +- } + if (ret < 0) + return ret; + +diff --git a/fs/afs/write.c b/fs/afs/write.c +index 34107b55f834..e959640694c2 100644 +--- a/fs/afs/write.c ++++ b/fs/afs/write.c +@@ -89,12 +89,10 @@ static const struct afs_operation_ops afs_store_data_operation = { + */ + void afs_prepare_write(struct netfs_io_subrequest *subreq) + { +- struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr]; +- + //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) + // subreq->max_len = 512 * 1024; + //else +- stream->sreq_max_len = 256 * 1024 * 1024; ++ subreq->max_len = 256 * 1024 * 1024; + } + + /* +diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c +index 024227aba4cd..f521e66d3bf6 100644 +--- a/fs/afs/yfsclient.c ++++ b/fs/afs/yfsclient.c +@@ -355,7 +355,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) + struct afs_vnode_param *vp = &op->file[0]; + struct afs_read *req = op->fetch.req; + const __be32 *bp; +- size_t count_before; + int ret; + + _enter("{%u,%zu, %zu/%llu}", +@@ -392,14 +391,10 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) + + /* extract the returned data */ + case 2: +- count_before = call->iov_len; +- 
_debug("extract data %zu/%llu", count_before, req->actual_len); ++ _debug("extract data %zu/%llu", ++ iov_iter_count(call->iter), req->actual_len); + + ret = afs_extract_data(call, true); +- if (req->subreq) { +- req->subreq->transferred += count_before - call->iov_len; +- netfs_read_subreq_progress(req->subreq, false); +- } + if (ret < 0) + return ret; + +diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c +index 6a821a959b59..a91acd03ee12 100644 +--- a/fs/cachefiles/io.c ++++ b/fs/cachefiles/io.c +@@ -627,12 +627,11 @@ static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq) + { + struct netfs_io_request *wreq = subreq->rreq; + struct netfs_cache_resources *cres = &wreq->cache_resources; +- struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; + + _enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start); + +- stream->sreq_max_len = MAX_RW_COUNT; +- stream->sreq_max_segs = BIO_MAX_VECS; ++ subreq->max_len = MAX_RW_COUNT; ++ subreq->max_nr_segs = BIO_MAX_VECS; + + if (!cachefiles_cres_file(cres)) { + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) +@@ -648,7 +647,6 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) + struct netfs_cache_resources *cres = &wreq->cache_resources; + struct cachefiles_object *object = cachefiles_cres_object(cres); + struct cachefiles_cache *cache = object->volume->cache; +- struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; + const struct cred *saved_cred; + size_t off, pre, post, len = subreq->len; + loff_t start = subreq->start; +@@ -662,7 +660,6 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) + if (off) { + pre = CACHEFILES_DIO_BLOCK_SIZE - off; + if (pre >= len) { +- fscache_count_dio_misfit(); + netfs_write_subrequest_terminated(subreq, len, false); + return; + } +@@ -673,22 +670,10 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) + } + + /* We also need to end on the 
cache granularity boundary */ +- if (start + len == wreq->i_size) { +- size_t part = len % CACHEFILES_DIO_BLOCK_SIZE; +- size_t need = CACHEFILES_DIO_BLOCK_SIZE - part; +- +- if (part && stream->submit_extendable_to >= need) { +- len += need; +- subreq->len += need; +- subreq->io_iter.count += need; +- } +- } +- + post = len & (CACHEFILES_DIO_BLOCK_SIZE - 1); + if (post) { + len -= post; + if (len == 0) { +- fscache_count_dio_misfit(); + netfs_write_subrequest_terminated(subreq, post, false); + return; + } +diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c +index 7c6f260a3be5..4dd8a993c60a 100644 +--- a/fs/cachefiles/xattr.c ++++ b/fs/cachefiles/xattr.c +@@ -64,15 +64,9 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) + memcpy(buf->data, fscache_get_aux(object->cookie), len); + + ret = cachefiles_inject_write_error(); +- if (ret == 0) { +- ret = mnt_want_write_file(file); +- if (ret == 0) { +- ret = vfs_setxattr(&nop_mnt_idmap, dentry, +- cachefiles_xattr_cache, buf, +- sizeof(struct cachefiles_xattr) + len, 0); +- mnt_drop_write_file(file); +- } +- } ++ if (ret == 0) ++ ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, ++ buf, sizeof(struct cachefiles_xattr) + len, 0); + if (ret < 0) { + trace_cachefiles_vfs_error(object, file_inode(file), ret, + cachefiles_trace_setxattr_error); +@@ -157,14 +151,8 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + int ret; + + ret = cachefiles_inject_remove_error(); +- if (ret == 0) { +- ret = mnt_want_write(cache->mnt); +- if (ret == 0) { +- ret = vfs_removexattr(&nop_mnt_idmap, dentry, +- cachefiles_xattr_cache); +- mnt_drop_write(cache->mnt); +- } +- } ++ if (ret == 0) ++ ret = vfs_removexattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache); + if (ret < 0) { + trace_cachefiles_vfs_error(object, d_inode(dentry), ret, + cachefiles_trace_remxattr_error); +@@ -220,15 +208,9 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) + memcpy(buf->data, p, 
volume->vcookie->coherency_len); + + ret = cachefiles_inject_write_error(); +- if (ret == 0) { +- ret = mnt_want_write(volume->cache->mnt); +- if (ret == 0) { +- ret = vfs_setxattr(&nop_mnt_idmap, dentry, +- cachefiles_xattr_cache, +- buf, len, 0); +- mnt_drop_write(volume->cache->mnt); +- } +- } ++ if (ret == 0) ++ ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, ++ buf, len, 0); + if (ret < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, + cachefiles_trace_setxattr_error); +diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c +index 5d9ccda098cc..a5f848c167fa 100644 +--- a/fs/ceph/addr.c ++++ b/fs/ceph/addr.c +@@ -13,7 +13,6 @@ + #include + #include + #include +-#include + + #include "super.h" + #include "mds_client.h" +@@ -206,6 +205,21 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) + } + } + ++static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) ++{ ++ struct inode *inode = subreq->rreq->inode; ++ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); ++ struct ceph_inode_info *ci = ceph_inode(inode); ++ u64 objno, objoff; ++ u32 xlen; ++ ++ /* Truncate the extent at the end of the current block */ ++ ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, ++ &objno, &objoff, &xlen); ++ subreq->len = min(xlen, fsc->mount_options->rsize); ++ return true; ++} ++ + static void finish_netfs_read(struct ceph_osd_request *req) + { + struct inode *inode = req->r_inode; +@@ -250,12 +264,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) + calc_pages_for(osd_data->alignment, + osd_data->length), false); + } +- if (err > 0) { +- subreq->transferred = err; +- err = 0; +- } +- trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); +- netfs_read_subreq_terminated(subreq, err, false); ++ netfs_subreq_terminated(subreq, err, false); + iput(req->r_inode); + ceph_dec_osd_stopping_blocker(fsc->mdsc); + } +@@ -269,6 +278,7 @@ static bool ceph_netfs_issue_op_inline(struct 
netfs_io_subrequest *subreq) + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); ++ struct iov_iter iter; + ssize_t err = 0; + size_t len; + int mode; +@@ -291,7 +301,6 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); + req->r_num_caps = 2; + +- trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto out; +@@ -305,36 +314,17 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) + } + + len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); +- err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter); +- if (err == 0) { ++ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); ++ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); ++ if (err == 0) + err = -EFAULT; +- } else { +- subreq->transferred += err; +- err = 0; +- } + + ceph_mdsc_put_request(req); + out: +- netfs_read_subreq_terminated(subreq, err, false); ++ netfs_subreq_terminated(subreq, err, false); + return true; + } + +-static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq) +-{ +- struct netfs_io_request *rreq = subreq->rreq; +- struct inode *inode = rreq->inode; +- struct ceph_inode_info *ci = ceph_inode(inode); +- struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); +- u64 objno, objoff; +- u32 xlen; +- +- /* Truncate the extent at the end of the current block */ +- ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, +- &objno, &objoff, &xlen); +- rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize); +- return 0; +-} +- + static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) + { + struct netfs_io_request *rreq = subreq->rreq; +@@ -344,8 +334,9 @@ static void 
ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) + struct ceph_client *cl = fsc->client; + struct ceph_osd_request *req = NULL; + struct ceph_vino vino = ceph_vino(inode); +- int err; +- u64 len; ++ struct iov_iter iter; ++ int err = 0; ++ u64 len = subreq->len; + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 off = subreq->start; + int extent_cnt; +@@ -358,12 +349,6 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) + return; + +- // TODO: This rounding here is slightly dodgy. It *should* work, for +- // now, as the cache only deals in blocks that are a multiple of +- // PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to +- // happen is for the fscrypt driving to be moved into netfslib and the +- // data in the cache also to be stored encrypted. +- len = subreq->len; + ceph_fscrypt_adjust_off_and_len(inode, &off, &len); + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, +@@ -386,6 +371,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) + doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n", + ceph_vinop(inode), subreq->start, subreq->len, len); + ++ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); ++ + /* + * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for + * encrypted inodes. 
We'd need infrastructure that handles an iov_iter +@@ -397,7 +384,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) + struct page **pages; + size_t page_off; + +- err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off); ++ err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); + if (err < 0) { + doutc(cl, "%llx.%llx failed to allocate pages, %d\n", + ceph_vinop(inode), err); +@@ -412,7 +399,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, + false); + } else { +- osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter); ++ osd_req_op_extent_osd_iter(req, 0, &iter); + } + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + err = -EIO; +@@ -423,19 +410,17 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) + req->r_inode = inode; + ihold(inode); + +- trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + ceph_osdc_start_request(req->r_osdc, req); + out: + ceph_osdc_put_request(req); + if (err) +- netfs_read_subreq_terminated(subreq, err, false); ++ netfs_subreq_terminated(subreq, err, false); + doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); + } + + static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) + { + struct inode *inode = rreq->inode; +- struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); + int got = 0, want = CEPH_CAP_FILE_CACHE; + struct ceph_netfs_request_data *priv; +@@ -487,7 +472,6 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) + + priv->caps = got; + rreq->netfs_priv = priv; +- rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize; + + out: + if (ret < 0) +@@ -512,9 +496,9 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq) + const struct netfs_request_ops ceph_netfs_ops = { + .init_request = ceph_init_request, + .free_request = 
ceph_netfs_free_request, +- .prepare_read = ceph_netfs_prepare_read, + .issue_read = ceph_netfs_issue_read, + .expand_readahead = ceph_netfs_expand_readahead, ++ .clamp_length = ceph_netfs_clamp_length, + .check_write_begin = ceph_netfs_check_write_begin, + }; + +diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile +index d08b0bfb6756..8e6781e0b10b 100644 +--- a/fs/netfs/Makefile ++++ b/fs/netfs/Makefile +@@ -5,14 +5,12 @@ netfs-y := \ + buffered_write.o \ + direct_read.o \ + direct_write.o \ ++ io.o \ + iterator.o \ + locking.o \ + main.o \ + misc.o \ + objects.o \ +- read_collect.o \ +- read_pgpriv2.o \ +- read_retry.o \ + write_collect.o \ + write_issue.o + +diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c +index c40e226053cc..27c750d39476 100644 +--- a/fs/netfs/buffered_read.c ++++ b/fs/netfs/buffered_read.c +@@ -9,388 +9,266 @@ + #include + #include "internal.h" + +-static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, +- unsigned long long *_start, +- unsigned long long *_len, +- unsigned long long i_size) ++/* ++ * [DEPRECATED] Unlock the folios in a read operation for when the filesystem ++ * is using PG_private_2 and direct writing to the cache from here rather than ++ * marking the page for writeback. ++ * ++ * Note that we don't touch folio->private in this code. 
++ */ ++static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq, ++ size_t *account) + { +- struct netfs_cache_resources *cres = &rreq->cache_resources; ++ struct netfs_io_subrequest *subreq; ++ struct folio *folio; ++ pgoff_t start_page = rreq->start / PAGE_SIZE; ++ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; ++ bool subreq_failed = false; + +- if (cres->ops && cres->ops->expand_readahead) +- cres->ops->expand_readahead(cres, _start, _len, i_size); +-} ++ XA_STATE(xas, &rreq->mapping->i_pages, start_page); + +-static void netfs_rreq_expand(struct netfs_io_request *rreq, +- struct readahead_control *ractl) +-{ +- /* Give the cache a chance to change the request parameters. The +- * resultant request must contain the original region. ++ /* Walk through the pagecache and the I/O request lists simultaneously. ++ * We may have a mixture of cached and uncached sections and we only ++ * really want to write out the uncached sections. This is slightly ++ * complicated by the possibility that we might have huge pages with a ++ * mixture inside. + */ +- netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size); ++ subreq = list_first_entry(&rreq->subrequests, ++ struct netfs_io_subrequest, rreq_link); ++ subreq_failed = (subreq->error < 0); + +- /* Give the netfs a chance to change the request parameters. The +- * resultant request must contain the original region. +- */ +- if (rreq->netfs_ops->expand_readahead) +- rreq->netfs_ops->expand_readahead(rreq); ++ trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2); + +- /* Expand the request if the cache wants it to start earlier. Note +- * that the expansion may get further extended if the VM wishes to +- * insert THPs and the preferred start and/or end wind up in the middle +- * of THPs. +- * +- * If this is the case, however, the THP size should be an integer +- * multiple of the cache granule size, so we get a whole number of +- * granules to deal with. 
+- */ +- if (rreq->start != readahead_pos(ractl) || +- rreq->len != readahead_length(ractl)) { +- readahead_expand(ractl, rreq->start, rreq->len); +- rreq->start = readahead_pos(ractl); +- rreq->len = readahead_length(ractl); ++ rcu_read_lock(); ++ xas_for_each(&xas, folio, last_page) { ++ loff_t pg_end; ++ bool pg_failed = false; ++ bool folio_started = false; + +- trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), +- netfs_read_trace_expanded); +- } +-} ++ if (xas_retry(&xas, folio)) ++ continue; + +-/* +- * Begin an operation, and fetch the stored zero point value from the cookie if +- * available. +- */ +-static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx) +-{ +- return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); +-} ++ pg_end = folio_pos(folio) + folio_size(folio) - 1; + +-/* +- * Decant the list of folios to read into a rolling buffer. +- */ +-static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, +- struct folio_queue *folioq) +-{ +- unsigned int order, nr; +- size_t size = 0; +- +- nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios, +- ARRAY_SIZE(folioq->vec.folios)); +- folioq->vec.nr = nr; +- for (int i = 0; i < nr; i++) { +- struct folio *folio = folioq_folio(folioq, i); +- +- trace_netfs_folio(folio, netfs_folio_trace_read); +- order = folio_order(folio); +- folioq->orders[i] = order; +- size += PAGE_SIZE << order; +- } ++ for (;;) { ++ loff_t sreq_end; + +- for (int i = nr; i < folioq_nr_slots(folioq); i++) +- folioq_clear(folioq, i); ++ if (!subreq) { ++ pg_failed = true; ++ break; ++ } + +- return size; +-} ++ if (!folio_started && ++ test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) && ++ fscache_operation_valid(&rreq->cache_resources)) { ++ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); ++ folio_start_private_2(folio); ++ folio_started = true; ++ } + +-/* +- * netfs_prepare_read_iterator - Prepare the subreq 
iterator for I/O +- * @subreq: The subrequest to be set up +- * +- * Prepare the I/O iterator representing the read buffer on a subrequest for +- * the filesystem to use for I/O (it can be passed directly to a socket). This +- * is intended to be called from the ->issue_read() method once the filesystem +- * has trimmed the request to the size it wants. +- * +- * Returns the limited size if successful and -ENOMEM if insufficient memory +- * available. +- * +- * [!] NOTE: This must be run in the same thread as ->issue_read() was called +- * in as we access the readahead_control struct. +- */ +-static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) +-{ +- struct netfs_io_request *rreq = subreq->rreq; +- size_t rsize = subreq->len; +- +- if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER) +- rsize = umin(rsize, rreq->io_streams[0].sreq_max_len); +- +- if (rreq->ractl) { +- /* If we don't have sufficient folios in the rolling buffer, +- * extract a folioq's worth from the readahead region at a time +- * into the buffer. Note that this acquires a ref on each page +- * that we will need to release later - but we don't want to do +- * that until after we've started the I/O. 
+- */ +- while (rreq->submitted < subreq->start + rsize) { +- struct folio_queue *tail = rreq->buffer_tail, *new; +- size_t added; +- +- new = kmalloc(sizeof(*new), GFP_NOFS); +- if (!new) +- return -ENOMEM; +- netfs_stat(&netfs_n_folioq); +- folioq_init(new); +- new->prev = tail; +- tail->next = new; +- rreq->buffer_tail = new; +- added = netfs_load_buffer_from_ra(rreq, new); +- rreq->iter.count += added; +- rreq->submitted += added; +- } +- } ++ pg_failed |= subreq_failed; ++ sreq_end = subreq->start + subreq->len - 1; ++ if (pg_end < sreq_end) ++ break; + +- subreq->len = rsize; +- if (unlikely(rreq->io_streams[0].sreq_max_segs)) { +- size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, +- rreq->io_streams[0].sreq_max_segs); ++ *account += subreq->transferred; ++ if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { ++ subreq = list_next_entry(subreq, rreq_link); ++ subreq_failed = (subreq->error < 0); ++ } else { ++ subreq = NULL; ++ subreq_failed = false; ++ } + +- if (limit < rsize) { +- subreq->len = limit; +- trace_netfs_sreq(subreq, netfs_sreq_trace_limited); ++ if (pg_end == sreq_end) ++ break; + } +- } + +- subreq->io_iter = rreq->iter; ++ if (!pg_failed) { ++ flush_dcache_folio(folio); ++ folio_mark_uptodate(folio); ++ } + +- if (iov_iter_is_folioq(&subreq->io_iter)) { +- if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) { +- subreq->io_iter.folioq = subreq->io_iter.folioq->next; +- subreq->io_iter.folioq_slot = 0; ++ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { ++ if (folio->index == rreq->no_unlock_folio && ++ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) ++ _debug("no unlock"); ++ else ++ folio_unlock(folio); + } +- subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq; +- subreq->curr_folioq_slot = subreq->io_iter.folioq_slot; +- subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; + } +- +- iov_iter_truncate(&subreq->io_iter, subreq->len); +- 
iov_iter_advance(&rreq->iter, subreq->len); +- return subreq->len; ++ rcu_read_unlock(); + } + +-static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rreq, +- struct netfs_io_subrequest *subreq, +- loff_t i_size) ++/* ++ * Unlock the folios in a read operation. We need to set PG_writeback on any ++ * folios we're going to write back before we unlock them. ++ * ++ * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use ++ * PG_private_2 and do a direct write to the cache from here instead. ++ */ ++void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) + { +- struct netfs_cache_resources *cres = &rreq->cache_resources; ++ struct netfs_io_subrequest *subreq; ++ struct netfs_folio *finfo; ++ struct folio *folio; ++ pgoff_t start_page = rreq->start / PAGE_SIZE; ++ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; ++ size_t account = 0; ++ bool subreq_failed = false; + +- if (!cres->ops) +- return NETFS_DOWNLOAD_FROM_SERVER; +- return cres->ops->prepare_read(subreq, i_size); +-} ++ XA_STATE(xas, &rreq->mapping->i_pages, start_page); + +-static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, +- bool was_async) +-{ +- struct netfs_io_subrequest *subreq = priv; ++ if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { ++ __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); ++ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { ++ __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); ++ } ++ } + +- if (transferred_or_error < 0) { +- netfs_read_subreq_terminated(subreq, transferred_or_error, was_async); +- return; ++ /* Handle deprecated PG_private_2 case. 
*/ ++ if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { ++ netfs_rreq_unlock_folios_pgpriv2(rreq, &account); ++ goto out; + } + +- if (transferred_or_error > 0) +- subreq->transferred += transferred_or_error; +- netfs_read_subreq_terminated(subreq, 0, was_async); +-} ++ /* Walk through the pagecache and the I/O request lists simultaneously. ++ * We may have a mixture of cached and uncached sections and we only ++ * really want to write out the uncached sections. This is slightly ++ * complicated by the possibility that we might have huge pages with a ++ * mixture inside. ++ */ ++ subreq = list_first_entry(&rreq->subrequests, ++ struct netfs_io_subrequest, rreq_link); ++ subreq_failed = (subreq->error < 0); + +-/* +- * Issue a read against the cache. +- * - Eats the caller's ref on subreq. +- */ +-static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, +- struct netfs_io_subrequest *subreq) +-{ +- struct netfs_cache_resources *cres = &rreq->cache_resources; ++ trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); + +- netfs_stat(&netfs_n_rh_read); +- cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_IGNORE, +- netfs_cache_read_terminated, subreq); +-} ++ rcu_read_lock(); ++ xas_for_each(&xas, folio, last_page) { ++ loff_t pg_end; ++ bool pg_failed = false; ++ bool wback_to_cache = false; + +-/* +- * Perform a read to the pagecache from a series of sources of different types, +- * slicing up the region to be read according to available cache blocks and +- * network rsize. 
+- */ +-static void netfs_read_to_pagecache(struct netfs_io_request *rreq) +-{ +- struct netfs_inode *ictx = netfs_inode(rreq->inode); +- unsigned long long start = rreq->start; +- ssize_t size = rreq->len; +- int ret = 0; +- +- atomic_inc(&rreq->nr_outstanding); +- +- do { +- struct netfs_io_subrequest *subreq; +- enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; +- ssize_t slice; +- +- subreq = netfs_alloc_subrequest(rreq); +- if (!subreq) { +- ret = -ENOMEM; +- break; +- } ++ if (xas_retry(&xas, folio)) ++ continue; + +- subreq->start = start; +- subreq->len = size; +- +- atomic_inc(&rreq->nr_outstanding); +- spin_lock_bh(&rreq->lock); +- list_add_tail(&subreq->rreq_link, &rreq->subrequests); +- subreq->prev_donated = rreq->prev_donated; +- rreq->prev_donated = 0; +- trace_netfs_sreq(subreq, netfs_sreq_trace_added); +- spin_unlock_bh(&rreq->lock); +- +- source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); +- subreq->source = source; +- if (source == NETFS_DOWNLOAD_FROM_SERVER) { +- unsigned long long zp = umin(ictx->zero_point, rreq->i_size); +- size_t len = subreq->len; +- +- if (subreq->start >= zp) { +- subreq->source = source = NETFS_FILL_WITH_ZEROES; +- goto fill_with_zeroes; +- } ++ pg_end = folio_pos(folio) + folio_size(folio) - 1; ++ ++ for (;;) { ++ loff_t sreq_end; + +- if (len > zp - subreq->start) +- len = zp - subreq->start; +- if (len == 0) { +- pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx", +- rreq->debug_id, subreq->debug_index, +- subreq->len, size, +- subreq->start, ictx->zero_point, rreq->i_size); ++ if (!subreq) { ++ pg_failed = true; + break; + } +- subreq->len = len; +- +- netfs_stat(&netfs_n_rh_download); +- if (rreq->netfs_ops->prepare_read) { +- ret = rreq->netfs_ops->prepare_read(subreq); +- if (ret < 0) { +- atomic_dec(&rreq->nr_outstanding); +- netfs_put_subrequest(subreq, false, +- netfs_sreq_trace_put_cancel); +- break; +- } +- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); +- } + +- slice 
= netfs_prepare_read_iterator(subreq); +- if (slice < 0) { +- atomic_dec(&rreq->nr_outstanding); +- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); +- ret = slice; ++ wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); ++ pg_failed |= subreq_failed; ++ sreq_end = subreq->start + subreq->len - 1; ++ if (pg_end < sreq_end) + break; ++ ++ account += subreq->transferred; ++ if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { ++ subreq = list_next_entry(subreq, rreq_link); ++ subreq_failed = (subreq->error < 0); ++ } else { ++ subreq = NULL; ++ subreq_failed = false; + } + +- rreq->netfs_ops->issue_read(subreq); +- goto done; ++ if (pg_end == sreq_end) ++ break; + } + +- fill_with_zeroes: +- if (source == NETFS_FILL_WITH_ZEROES) { +- subreq->source = NETFS_FILL_WITH_ZEROES; +- trace_netfs_sreq(subreq, netfs_sreq_trace_submit); +- netfs_stat(&netfs_n_rh_zero); +- slice = netfs_prepare_read_iterator(subreq); +- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); +- netfs_read_subreq_terminated(subreq, 0, false); +- goto done; ++ if (!pg_failed) { ++ flush_dcache_folio(folio); ++ finfo = netfs_folio_info(folio); ++ if (finfo) { ++ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); ++ if (finfo->netfs_group) ++ folio_change_private(folio, finfo->netfs_group); ++ else ++ folio_detach_private(folio); ++ kfree(finfo); ++ } ++ folio_mark_uptodate(folio); ++ if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) { ++ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); ++ folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); ++ filemap_dirty_folio(folio->mapping, folio); ++ } + } + +- if (source == NETFS_READ_FROM_CACHE) { +- trace_netfs_sreq(subreq, netfs_sreq_trace_submit); +- slice = netfs_prepare_read_iterator(subreq); +- netfs_read_cache_to_pagecache(rreq, subreq); +- goto done; ++ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { ++ if (folio->index == rreq->no_unlock_folio && ++ 
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) ++ _debug("no unlock"); ++ else ++ folio_unlock(folio); + } ++ } ++ rcu_read_unlock(); + +- pr_err("Unexpected read source %u\n", source); +- WARN_ON_ONCE(1); +- break; +- +- done: +- size -= slice; +- start += slice; +- cond_resched(); +- } while (size > 0); ++out: ++ task_io_account_read(account); ++ if (rreq->netfs_ops->done) ++ rreq->netfs_ops->done(rreq); ++} + +- if (atomic_dec_and_test(&rreq->nr_outstanding)) +- netfs_rreq_terminated(rreq, false); ++static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, ++ unsigned long long *_start, ++ unsigned long long *_len, ++ unsigned long long i_size) ++{ ++ struct netfs_cache_resources *cres = &rreq->cache_resources; + +- /* Defer error return as we may need to wait for outstanding I/O. */ +- cmpxchg(&rreq->error, 0, ret); ++ if (cres->ops && cres->ops->expand_readahead) ++ cres->ops->expand_readahead(cres, _start, _len, i_size); + } + +-/* +- * Wait for the read operation to complete, successfully or otherwise. +- */ +-static int netfs_wait_for_read(struct netfs_io_request *rreq) ++static void netfs_rreq_expand(struct netfs_io_request *rreq, ++ struct readahead_control *ractl) + { +- int ret; ++ /* Give the cache a chance to change the request parameters. The ++ * resultant request must contain the original region. ++ */ ++ netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size); + +- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); +- wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); +- ret = rreq->error; +- if (ret == 0 && rreq->submitted < rreq->len) { +- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); +- ret = -EIO; +- } ++ /* Give the netfs a chance to change the request parameters. The ++ * resultant request must contain the original region. 
++ */ ++ if (rreq->netfs_ops->expand_readahead) ++ rreq->netfs_ops->expand_readahead(rreq); + +- return ret; +-} ++ /* Expand the request if the cache wants it to start earlier. Note ++ * that the expansion may get further extended if the VM wishes to ++ * insert THPs and the preferred start and/or end wind up in the middle ++ * of THPs. ++ * ++ * If this is the case, however, the THP size should be an integer ++ * multiple of the cache granule size, so we get a whole number of ++ * granules to deal with. ++ */ ++ if (rreq->start != readahead_pos(ractl) || ++ rreq->len != readahead_length(ractl)) { ++ readahead_expand(ractl, rreq->start, rreq->len); ++ rreq->start = readahead_pos(ractl); ++ rreq->len = readahead_length(ractl); + +-/* +- * Set up the initial folioq of buffer folios in the rolling buffer and set the +- * iterator to refer to it. +- */ +-static int netfs_prime_buffer(struct netfs_io_request *rreq) +-{ +- struct folio_queue *folioq; +- size_t added; +- +- folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); +- if (!folioq) +- return -ENOMEM; +- netfs_stat(&netfs_n_folioq); +- folioq_init(folioq); +- rreq->buffer = folioq; +- rreq->buffer_tail = folioq; +- rreq->submitted = rreq->start; +- iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0); +- +- added = netfs_load_buffer_from_ra(rreq, folioq); +- rreq->iter.count += added; +- rreq->submitted += added; +- return 0; ++ trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), ++ netfs_read_trace_expanded); ++ } + } + + /* +- * Drop the ref on each folio that we inherited from the VM readahead code. We +- * still have the folio locks to pin the page until we complete the I/O. +- * +- * Note that we can't just release the batch in each queue struct as we use the +- * occupancy count in other places. ++ * Begin an operation, and fetch the stored zero point value from the cookie if ++ * available. 
+ */ +-static void netfs_put_ra_refs(struct folio_queue *folioq) ++static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx) + { +- struct folio_batch fbatch; +- +- folio_batch_init(&fbatch); +- while (folioq) { +- for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) { +- struct folio *folio = folioq_folio(folioq, slot); +- if (!folio) +- continue; +- trace_netfs_folio(folio, netfs_folio_trace_read_put); +- if (!folio_batch_add(&fbatch, folio)) +- folio_batch_release(&fbatch); +- } +- folioq = folioq->next; +- } +- +- folio_batch_release(&fbatch); ++ return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); + } + + /** +@@ -411,17 +289,22 @@ static void netfs_put_ra_refs(struct folio_queue *folioq) + void netfs_readahead(struct readahead_control *ractl) + { + struct netfs_io_request *rreq; +- struct netfs_inode *ictx = netfs_inode(ractl->mapping->host); +- unsigned long long start = readahead_pos(ractl); +- size_t size = readahead_length(ractl); ++ struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); + int ret; + +- rreq = netfs_alloc_request(ractl->mapping, ractl->file, start, size, ++ _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); ++ ++ if (readahead_count(ractl) == 0) ++ return; ++ ++ rreq = netfs_alloc_request(ractl->mapping, ractl->file, ++ readahead_pos(ractl), ++ readahead_length(ractl), + NETFS_READAHEAD); + if (IS_ERR(rreq)) + return; + +- ret = netfs_begin_cache_read(rreq, ictx); ++ ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; + +@@ -431,15 +314,18 @@ void netfs_readahead(struct readahead_control *ractl) + + netfs_rreq_expand(rreq, ractl); + +- rreq->ractl = ractl; +- if (netfs_prime_buffer(rreq) < 0) +- goto cleanup_free; +- netfs_read_to_pagecache(rreq); ++ /* Set up the output buffer */ ++ iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages, ++ rreq->start, rreq->len); 
+ +- /* Release the folio refs whilst we're waiting for the I/O. */ +- netfs_put_ra_refs(rreq->buffer); ++ /* Drop the refs on the folios here rather than in the cache or ++ * filesystem. The locks will be dropped in netfs_rreq_unlock(). ++ */ ++ while (readahead_folio(ractl)) ++ ; + +- netfs_put_request(rreq, true, netfs_rreq_trace_put_return); ++ netfs_begin_read(rreq, false); ++ netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return; + + cleanup_free: +@@ -448,117 +334,6 @@ void netfs_readahead(struct readahead_control *ractl) + } + EXPORT_SYMBOL(netfs_readahead); + +-/* +- * Create a rolling buffer with a single occupying folio. +- */ +-static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio) +-{ +- struct folio_queue *folioq; +- +- folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); +- if (!folioq) +- return -ENOMEM; +- +- netfs_stat(&netfs_n_folioq); +- folioq_init(folioq); +- folioq_append(folioq, folio); +- BUG_ON(folioq_folio(folioq, 0) != folio); +- BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio)); +- rreq->buffer = folioq; +- rreq->buffer_tail = folioq; +- rreq->submitted = rreq->start + rreq->len; +- iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len); +- rreq->ractl = (struct readahead_control *)1UL; +- return 0; +-} +- +-/* +- * Read into gaps in a folio partially filled by a streaming write. 
+- */ +-static int netfs_read_gaps(struct file *file, struct folio *folio) +-{ +- struct netfs_io_request *rreq; +- struct address_space *mapping = folio->mapping; +- struct netfs_folio *finfo = netfs_folio_info(folio); +- struct netfs_inode *ctx = netfs_inode(mapping->host); +- struct folio *sink = NULL; +- struct bio_vec *bvec; +- unsigned int from = finfo->dirty_offset; +- unsigned int to = from + finfo->dirty_len; +- unsigned int off = 0, i = 0; +- size_t flen = folio_size(folio); +- size_t nr_bvec = flen / PAGE_SIZE + 2; +- size_t part; +- int ret; +- +- _enter("%lx", folio->index); +- +- rreq = netfs_alloc_request(mapping, file, folio_pos(folio), flen, NETFS_READ_GAPS); +- if (IS_ERR(rreq)) { +- ret = PTR_ERR(rreq); +- goto alloc_error; +- } +- +- ret = netfs_begin_cache_read(rreq, ctx); +- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) +- goto discard; +- +- netfs_stat(&netfs_n_rh_read_folio); +- trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_read_gaps); +- +- /* Fiddle the buffer so that a gap at the beginning and/or a gap at the +- * end get copied to, but the middle is discarded. 
+- */ +- ret = -ENOMEM; +- bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); +- if (!bvec) +- goto discard; +- +- sink = folio_alloc(GFP_KERNEL, 0); +- if (!sink) { +- kfree(bvec); +- goto discard; +- } +- +- trace_netfs_folio(folio, netfs_folio_trace_read_gaps); +- +- rreq->direct_bv = bvec; +- rreq->direct_bv_count = nr_bvec; +- if (from > 0) { +- bvec_set_folio(&bvec[i++], folio, from, 0); +- off = from; +- } +- while (off < to) { +- part = min_t(size_t, to - off, PAGE_SIZE); +- bvec_set_folio(&bvec[i++], sink, part, 0); +- off += part; +- } +- if (to < flen) +- bvec_set_folio(&bvec[i++], folio, flen - to, to); +- iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); +- rreq->submitted = rreq->start + flen; +- +- netfs_read_to_pagecache(rreq); +- +- if (sink) +- folio_put(sink); +- +- ret = netfs_wait_for_read(rreq); +- if (ret == 0) { +- flush_dcache_folio(folio); +- folio_mark_uptodate(folio); +- } +- folio_unlock(folio); +- netfs_put_request(rreq, false, netfs_rreq_trace_put_return); +- return ret < 0 ? 
ret : 0; +- +-discard: +- netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +-alloc_error: +- folio_unlock(folio); +- return ret; +-} +- + /** + * netfs_read_folio - Helper to manage a read_folio request + * @file: The file to read from +@@ -578,13 +353,9 @@ int netfs_read_folio(struct file *file, struct folio *folio) + struct address_space *mapping = folio->mapping; + struct netfs_io_request *rreq; + struct netfs_inode *ctx = netfs_inode(mapping->host); ++ struct folio *sink = NULL; + int ret; + +- if (folio_test_dirty(folio)) { +- trace_netfs_folio(folio, netfs_folio_trace_read_gaps); +- return netfs_read_gaps(file, folio); +- } +- + _enter("%lx", folio->index); + + rreq = netfs_alloc_request(mapping, file, +@@ -603,12 +374,54 @@ int netfs_read_folio(struct file *file, struct folio *folio) + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); + + /* Set up the output buffer */ +- ret = netfs_create_singular_buffer(rreq, folio); +- if (ret < 0) +- goto discard; ++ if (folio_test_dirty(folio)) { ++ /* Handle someone trying to read from an unflushed streaming ++ * write. We fiddle the buffer so that a gap at the beginning ++ * and/or a gap at the end get copied to, but the middle is ++ * discarded. 
++ */ ++ struct netfs_folio *finfo = netfs_folio_info(folio); ++ struct bio_vec *bvec; ++ unsigned int from = finfo->dirty_offset; ++ unsigned int to = from + finfo->dirty_len; ++ unsigned int off = 0, i = 0; ++ size_t flen = folio_size(folio); ++ size_t nr_bvec = flen / PAGE_SIZE + 2; ++ size_t part; ++ ++ ret = -ENOMEM; ++ bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); ++ if (!bvec) ++ goto discard; ++ ++ sink = folio_alloc(GFP_KERNEL, 0); ++ if (!sink) ++ goto discard; ++ ++ trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + +- netfs_read_to_pagecache(rreq); +- ret = netfs_wait_for_read(rreq); ++ rreq->direct_bv = bvec; ++ rreq->direct_bv_count = nr_bvec; ++ if (from > 0) { ++ bvec_set_folio(&bvec[i++], folio, from, 0); ++ off = from; ++ } ++ while (off < to) { ++ part = min_t(size_t, to - off, PAGE_SIZE); ++ bvec_set_folio(&bvec[i++], sink, part, 0); ++ off += part; ++ } ++ if (to < flen) ++ bvec_set_folio(&bvec[i++], folio, flen - to, to); ++ iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); ++ } else { ++ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, ++ rreq->start, rreq->len); ++ } ++ ++ ret = netfs_begin_read(rreq, true); ++ if (sink) ++ folio_put(sink); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret < 0 ? ret : 0; + +@@ -681,10 +494,13 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, + * + * Pre-read data for a write-begin request by drawing data from the cache if + * possible, or the netfs if not. Space beyond the EOF is zero-filled. +- * Multiple I/O requests from different sources will get munged together. ++ * Multiple I/O requests from different sources will get munged together. If ++ * necessary, the readahead window can be expanded in either direction to a ++ * more convenient alignment for RPC efficiency or to make storage in the cache ++ * feasible. 
+ * + * The calling netfs must provide a table of operations, only one of which, +- * issue_read, is mandatory. ++ * issue_op, is mandatory. + * + * The check_write_begin() operation can be provided to check for and flush + * conflicting writes once the folio is grabbed and locked. It is passed a +@@ -712,6 +528,8 @@ int netfs_write_begin(struct netfs_inode *ctx, + pgoff_t index = pos >> PAGE_SHIFT; + int ret; + ++ DEFINE_READAHEAD(ractl, file, NULL, mapping, index); ++ + retry: + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); +@@ -759,13 +577,22 @@ int netfs_write_begin(struct netfs_inode *ctx, + netfs_stat(&netfs_n_rh_write_begin); + trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); + ++ /* Expand the request to meet caching requirements and download ++ * preferences. ++ */ ++ ractl._nr_pages = folio_nr_pages(folio); ++ netfs_rreq_expand(rreq, &ractl); ++ + /* Set up the output buffer */ +- ret = netfs_create_singular_buffer(rreq, folio); +- if (ret < 0) +- goto error_put; ++ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, ++ rreq->start, rreq->len); + +- netfs_read_to_pagecache(rreq); +- ret = netfs_wait_for_read(rreq); ++ /* We hold the folio locks, so we can drop the references */ ++ folio_get(folio); ++ while (readahead_folio(&ractl)) ++ ; ++ ++ ret = netfs_begin_read(rreq, true); + if (ret < 0) + goto error; + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); +@@ -825,13 +652,10 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, + trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); + + /* Set up the output buffer */ +- ret = netfs_create_singular_buffer(rreq, folio); +- if (ret < 0) +- goto error_put; ++ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, ++ rreq->start, rreq->len); + +- folioq_mark2(rreq->buffer, 0); +- netfs_read_to_pagecache(rreq); +- ret = netfs_wait_for_read(rreq); ++ ret = netfs_begin_read(rreq, true); + 
netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret; + +diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c +index d7eae597e54d..ca53c5d1622e 100644 +--- a/fs/netfs/buffered_write.c ++++ b/fs/netfs/buffered_write.c +@@ -13,22 +13,91 @@ + #include + #include "internal.h" + +-static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) ++/* ++ * Determined write method. Adjust netfs_folio_traces if this is changed. ++ */ ++enum netfs_how_to_modify { ++ NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */ ++ NETFS_JUST_PREFETCH, /* We have to read the folio anyway */ ++ NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */ ++ NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */ ++ NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */ ++ NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */ ++ NETFS_FLUSH_CONTENT, /* Flush incompatible content. */ ++}; ++ ++static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) + { +- if (netfs_group) ++ void *priv = folio_get_private(folio); ++ ++ if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) + folio_attach_private(folio, netfs_get_group(netfs_group)); ++ else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) ++ folio_detach_private(folio); + } + +-static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) ++/* ++ * Decide how we should modify a folio. We might be attempting to do ++ * write-streaming, in which case we don't want to do a local RMW cycle if we can ++ * avoid it. If we're doing local caching or content crypto, we award that ++ * priority over avoiding RMW. If the file is open readably, then we also ++ * assume that we may want to read what we wrote. 
++ */ ++static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, ++ struct file *file, ++ struct folio *folio, ++ void *netfs_group, ++ size_t flen, ++ size_t offset, ++ size_t len, ++ bool maybe_trouble) + { +- void *priv = folio_get_private(folio); ++ struct netfs_folio *finfo = netfs_folio_info(folio); ++ struct netfs_group *group = netfs_folio_group(folio); ++ loff_t pos = folio_pos(folio); ++ ++ _enter(""); ++ ++ if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) ++ return NETFS_FLUSH_CONTENT; + +- if (unlikely(priv != netfs_group)) { +- if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) +- folio_attach_private(folio, netfs_get_group(netfs_group)); +- else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) +- folio_detach_private(folio); ++ if (folio_test_uptodate(folio)) ++ return NETFS_FOLIO_IS_UPTODATE; ++ ++ if (pos >= ctx->zero_point) ++ return NETFS_MODIFY_AND_CLEAR; ++ ++ if (!maybe_trouble && offset == 0 && len >= flen) ++ return NETFS_WHOLE_FOLIO_MODIFY; ++ ++ if (file->f_mode & FMODE_READ) ++ goto no_write_streaming; ++ ++ if (netfs_is_cache_enabled(ctx)) { ++ /* We don't want to get a streaming write on a file that loses ++ * caching service temporarily because the backing store got ++ * culled. ++ */ ++ goto no_write_streaming; + } ++ ++ if (!finfo) ++ return NETFS_STREAMING_WRITE; ++ ++ /* We can continue a streaming write only if it continues on from the ++ * previous. If it overlaps, we must flush lest we suffer a partial ++ * copy and disjoint dirty regions. 
++ */ ++ if (offset == finfo->dirty_offset + finfo->dirty_len) ++ return NETFS_STREAMING_WRITE_CONT; ++ return NETFS_FLUSH_CONTENT; ++ ++no_write_streaming: ++ if (finfo) { ++ netfs_stat(&netfs_n_wh_wstream_conflict); ++ return NETFS_FLUSH_CONTENT; ++ } ++ return NETFS_JUST_PREFETCH; + } + + /* +@@ -108,10 +177,13 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + .range_end = iocb->ki_pos + iter->count, + }; + struct netfs_io_request *wreq = NULL; +- struct folio *folio = NULL, *writethrough = NULL; ++ struct netfs_folio *finfo; ++ struct folio *folio, *writethrough = NULL; ++ enum netfs_how_to_modify howto; ++ enum netfs_folio_trace trace; + unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0; + ssize_t written = 0, ret, ret2; +- loff_t i_size, pos = iocb->ki_pos; ++ loff_t i_size, pos = iocb->ki_pos, from, to; + size_t max_chunk = mapping_max_folio_size(mapping); + bool maybe_trouble = false; + +@@ -141,14 +213,15 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + } + + do { +- struct netfs_folio *finfo; +- struct netfs_group *group; +- unsigned long long fpos; + size_t flen; + size_t offset; /* Offset into pagecache folio */ + size_t part; /* Bytes to write to folio */ + size_t copied; /* Bytes copied from user */ + ++ ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); ++ if (unlikely(ret < 0)) ++ break; ++ + offset = pos & (max_chunk - 1); + part = min(max_chunk - offset, iov_iter_count(iter)); + +@@ -174,8 +247,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + } + + flen = folio_size(folio); +- fpos = folio_pos(folio); +- offset = pos - fpos; ++ offset = pos & (flen - 1); + part = min_t(size_t, flen - offset, part); + + /* Wait for writeback to complete. The writeback engine owns +@@ -193,52 +265,71 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + goto error_folio_unlock; + } + +- /* Decide how we should modify a folio. 
We might be attempting +- * to do write-streaming, in which case we don't want to a +- * local RMW cycle if we can avoid it. If we're doing local +- * caching or content crypto, we award that priority over +- * avoiding RMW. If the file is open readably, then we also +- * assume that we may want to read what we wrote. ++ /* See if we need to prefetch the area we're going to modify. ++ * We need to do this before we get a lock on the folio in case ++ * there's more than one writer competing for the same cache ++ * block. + */ +- finfo = netfs_folio_info(folio); +- group = netfs_folio_group(folio); +- +- if (unlikely(group != netfs_group) && +- group != NETFS_FOLIO_COPY_TO_CACHE) +- goto flush_content; +- +- if (folio_test_uptodate(folio)) { +- if (mapping_writably_mapped(mapping)) +- flush_dcache_folio(folio); +- copied = copy_folio_from_iter_atomic(folio, offset, part, iter); +- if (unlikely(copied == 0)) +- goto copy_failed; +- netfs_set_group(folio, netfs_group); +- trace_netfs_folio(folio, netfs_folio_is_uptodate); +- goto copied; ++ howto = netfs_how_to_modify(ctx, file, folio, netfs_group, ++ flen, offset, part, maybe_trouble); ++ _debug("howto %u", howto); ++ switch (howto) { ++ case NETFS_JUST_PREFETCH: ++ ret = netfs_prefetch_for_write(file, folio, offset, part); ++ if (ret < 0) { ++ _debug("prefetch = %zd", ret); ++ goto error_folio_unlock; ++ } ++ break; ++ case NETFS_FOLIO_IS_UPTODATE: ++ case NETFS_WHOLE_FOLIO_MODIFY: ++ case NETFS_STREAMING_WRITE_CONT: ++ break; ++ case NETFS_MODIFY_AND_CLEAR: ++ zero_user_segment(&folio->page, 0, offset); ++ break; ++ case NETFS_STREAMING_WRITE: ++ ret = -EIO; ++ if (WARN_ON(folio_get_private(folio))) ++ goto error_folio_unlock; ++ break; ++ case NETFS_FLUSH_CONTENT: ++ trace_netfs_folio(folio, netfs_flush_content); ++ from = folio_pos(folio); ++ to = from + folio_size(folio) - 1; ++ folio_unlock(folio); ++ folio_put(folio); ++ ret = filemap_write_and_wait_range(mapping, from, to); ++ if (ret < 0) ++ goto 
error_folio_unlock; ++ continue; + } + +- /* If the page is above the zero-point then we assume that the +- * server would just return a block of zeros or a short read if +- * we try to read it. +- */ +- if (fpos >= ctx->zero_point) { +- zero_user_segment(&folio->page, 0, offset); +- copied = copy_folio_from_iter_atomic(folio, offset, part, iter); +- if (unlikely(copied == 0)) +- goto copy_failed; +- zero_user_segment(&folio->page, offset + copied, flen); +- __netfs_set_group(folio, netfs_group); +- folio_mark_uptodate(folio); +- trace_netfs_folio(folio, netfs_modify_and_clear); +- goto copied; ++ if (mapping_writably_mapped(mapping)) ++ flush_dcache_folio(folio); ++ ++ copied = copy_folio_from_iter_atomic(folio, offset, part, iter); ++ ++ flush_dcache_folio(folio); ++ ++ /* Deal with a (partially) failed copy */ ++ if (copied == 0) { ++ ret = -EFAULT; ++ goto error_folio_unlock; + } + +- /* See if we can write a whole folio in one go. */ +- if (!maybe_trouble && offset == 0 && part >= flen) { +- copied = copy_folio_from_iter_atomic(folio, offset, part, iter); +- if (unlikely(copied == 0)) +- goto copy_failed; ++ trace = (enum netfs_folio_trace)howto; ++ switch (howto) { ++ case NETFS_FOLIO_IS_UPTODATE: ++ case NETFS_JUST_PREFETCH: ++ netfs_set_group(folio, netfs_group); ++ break; ++ case NETFS_MODIFY_AND_CLEAR: ++ zero_user_segment(&folio->page, offset + copied, flen); ++ netfs_set_group(folio, netfs_group); ++ folio_mark_uptodate(folio); ++ break; ++ case NETFS_WHOLE_FOLIO_MODIFY: + if (unlikely(copied < part)) { + maybe_trouble = true; + iov_iter_revert(iter, copied); +@@ -246,53 +337,16 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + folio_unlock(folio); + goto retry; + } +- __netfs_set_group(folio, netfs_group); +- folio_mark_uptodate(folio); +- trace_netfs_folio(folio, netfs_whole_folio_modify); +- goto copied; +- } +- +- /* We don't want to do a streaming write on a file that loses +- * caching service temporarily because the 
backing store got +- * culled and we don't really want to get a streaming write on +- * a file that's open for reading as ->read_folio() then has to +- * be able to flush it. +- */ +- if ((file->f_mode & FMODE_READ) || +- netfs_is_cache_enabled(ctx)) { +- if (finfo) { +- netfs_stat(&netfs_n_wh_wstream_conflict); +- goto flush_content; +- } +- ret = netfs_prefetch_for_write(file, folio, offset, part); +- if (ret < 0) { +- _debug("prefetch = %zd", ret); +- goto error_folio_unlock; +- } +- /* Note that copy-to-cache may have been set. */ +- +- copied = copy_folio_from_iter_atomic(folio, offset, part, iter); +- if (unlikely(copied == 0)) +- goto copy_failed; + netfs_set_group(folio, netfs_group); +- trace_netfs_folio(folio, netfs_just_prefetch); +- goto copied; +- } +- +- if (!finfo) { +- ret = -EIO; +- if (WARN_ON(folio_get_private(folio))) +- goto error_folio_unlock; +- copied = copy_folio_from_iter_atomic(folio, offset, part, iter); +- if (unlikely(copied == 0)) +- goto copy_failed; ++ folio_mark_uptodate(folio); ++ break; ++ case NETFS_STREAMING_WRITE: + if (offset == 0 && copied == flen) { +- __netfs_set_group(folio, netfs_group); ++ netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); +- trace_netfs_folio(folio, netfs_streaming_filled_page); +- goto copied; ++ trace = netfs_streaming_filled_page; ++ break; + } +- + finfo = kzalloc(sizeof(*finfo), GFP_KERNEL); + if (!finfo) { + iov_iter_revert(iter, copied); +@@ -304,18 +358,9 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + finfo->dirty_len = copied; + folio_attach_private(folio, (void *)((unsigned long)finfo | + NETFS_FOLIO_INFO)); +- trace_netfs_folio(folio, netfs_streaming_write); +- goto copied; +- } +- +- /* We can continue a streaming write only if it continues on +- * from the previous. If it overlaps, we must flush lest we +- * suffer a partial copy and disjoint dirty regions. 
+- */ +- if (offset == finfo->dirty_offset + finfo->dirty_len) { +- copied = copy_folio_from_iter_atomic(folio, offset, part, iter); +- if (unlikely(copied == 0)) +- goto copy_failed; ++ break; ++ case NETFS_STREAMING_WRITE_CONT: ++ finfo = netfs_folio_info(folio); + finfo->dirty_len += copied; + if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { + if (finfo->netfs_group) +@@ -324,25 +369,17 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + folio_detach_private(folio); + folio_mark_uptodate(folio); + kfree(finfo); +- trace_netfs_folio(folio, netfs_streaming_cont_filled_page); +- } else { +- trace_netfs_folio(folio, netfs_streaming_write_cont); ++ trace = netfs_streaming_cont_filled_page; + } +- goto copied; +- } +- +- /* Incompatible write; flush the folio and try again. */ +- flush_content: +- trace_netfs_folio(folio, netfs_flush_content); +- folio_unlock(folio); +- folio_put(folio); +- ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1); +- if (ret < 0) ++ break; ++ default: ++ WARN(true, "Unexpected modify type %u ix=%lx\n", ++ howto, folio->index); ++ ret = -EIO; + goto error_folio_unlock; +- continue; ++ } + +- copied: +- flush_dcache_folio(folio); ++ trace_netfs_folio(folio, trace); + + /* Update the inode size if we moved the EOF marker */ + pos += copied; +@@ -364,22 +401,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + folio_put(folio); + folio = NULL; + +- ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); +- if (unlikely(ret < 0)) +- break; +- + cond_resched(); + } while (iov_iter_count(iter)); + + out: +- if (likely(written)) { +- /* Set indication that ctime and mtime got updated in case +- * close is deferred. 
+- */ +- set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags); +- if (unlikely(ctx->ops->post_modify)) +- ctx->ops->post_modify(inode); +- } ++ if (likely(written) && ctx->ops->post_modify) ++ ctx->ops->post_modify(inode); + + if (unlikely(wreq)) { + ret2 = netfs_end_writethrough(wreq, &wbc, writethrough); +@@ -394,8 +421,6 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + _leave(" = %zd [%zd]", written, ret); + return written ? written : ret; + +-copy_failed: +- ret = -EFAULT; + error_folio_unlock: + folio_unlock(folio); + folio_put(folio); +diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c +index b1a66a6e6bc2..10a1e4da6bda 100644 +--- a/fs/netfs/direct_read.c ++++ b/fs/netfs/direct_read.c +@@ -16,143 +16,6 @@ + #include + #include "internal.h" + +-static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) +-{ +- struct netfs_io_request *rreq = subreq->rreq; +- size_t rsize; +- +- rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len); +- subreq->len = rsize; +- +- if (unlikely(rreq->io_streams[0].sreq_max_segs)) { +- size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, +- rreq->io_streams[0].sreq_max_segs); +- +- if (limit < rsize) { +- subreq->len = limit; +- trace_netfs_sreq(subreq, netfs_sreq_trace_limited); +- } +- } +- +- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); +- +- subreq->io_iter = rreq->iter; +- iov_iter_truncate(&subreq->io_iter, subreq->len); +- iov_iter_advance(&rreq->iter, subreq->len); +-} +- +-/* +- * Perform a read to a buffer from the server, slicing up the region to be read +- * according to the network rsize. 
+- */ +-static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) +-{ +- unsigned long long start = rreq->start; +- ssize_t size = rreq->len; +- int ret = 0; +- +- atomic_set(&rreq->nr_outstanding, 1); +- +- do { +- struct netfs_io_subrequest *subreq; +- ssize_t slice; +- +- subreq = netfs_alloc_subrequest(rreq); +- if (!subreq) { +- ret = -ENOMEM; +- break; +- } +- +- subreq->source = NETFS_DOWNLOAD_FROM_SERVER; +- subreq->start = start; +- subreq->len = size; +- +- atomic_inc(&rreq->nr_outstanding); +- spin_lock_bh(&rreq->lock); +- list_add_tail(&subreq->rreq_link, &rreq->subrequests); +- subreq->prev_donated = rreq->prev_donated; +- rreq->prev_donated = 0; +- trace_netfs_sreq(subreq, netfs_sreq_trace_added); +- spin_unlock_bh(&rreq->lock); +- +- netfs_stat(&netfs_n_rh_download); +- if (rreq->netfs_ops->prepare_read) { +- ret = rreq->netfs_ops->prepare_read(subreq); +- if (ret < 0) { +- atomic_dec(&rreq->nr_outstanding); +- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); +- break; +- } +- } +- +- netfs_prepare_dio_read_iterator(subreq); +- slice = subreq->len; +- rreq->netfs_ops->issue_read(subreq); +- +- size -= slice; +- start += slice; +- rreq->submitted += slice; +- +- if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && +- test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) +- break; +- cond_resched(); +- } while (size > 0); +- +- if (atomic_dec_and_test(&rreq->nr_outstanding)) +- netfs_rreq_terminated(rreq, false); +- return ret; +-} +- +-/* +- * Perform a read to an application buffer, bypassing the pagecache and the +- * local disk cache. 
+- */ +-static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) +-{ +- int ret; +- +- _enter("R=%x %llx-%llx", +- rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); +- +- if (rreq->len == 0) { +- pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); +- return -EIO; +- } +- +- // TODO: Use bounce buffer if requested +- +- inode_dio_begin(rreq->inode); +- +- ret = netfs_dispatch_unbuffered_reads(rreq); +- +- if (!rreq->submitted) { +- netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); +- inode_dio_end(rreq->inode); +- ret = 0; +- goto out; +- } +- +- if (sync) { +- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); +- wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, +- TASK_UNINTERRUPTIBLE); +- +- ret = rreq->error; +- if (ret == 0 && rreq->submitted < rreq->len && +- rreq->origin != NETFS_DIO_READ) { +- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); +- ret = -EIO; +- } +- } else { +- ret = -EIOCBQUEUED; +- } +- +-out: +- _leave(" = %d", ret); +- return ret; +-} +- + /** + * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read +@@ -168,7 +31,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i + struct netfs_io_request *rreq; + ssize_t ret; + size_t orig_count = iov_iter_count(iter); +- bool sync = is_sync_kiocb(iocb); ++ bool async = !is_sync_kiocb(iocb); + + _enter(""); + +@@ -215,13 +78,13 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i + + // TODO: Set up bounce buffer if needed + +- if (!sync) ++ if (async) + rreq->iocb = iocb; + +- ret = netfs_unbuffered_read(rreq, sync); ++ ret = netfs_begin_read(rreq, is_sync_kiocb(iocb)); + if (ret < 0) + goto out; /* May be -EIOCBQUEUED */ +- if (sync) { ++ if (!async) { + // TODO: Copy from bounce buffer + iocb->ki_pos += rreq->transferred; + ret = rreq->transferred; +@@ -231,6 +94,8 @@ ssize_t 
netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + if (ret > 0) + orig_count -= ret; ++ if (ret != -EIOCBQUEUED) ++ iov_iter_revert(iter, orig_count - iov_iter_count(iter)); + return ret; + } + EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); +diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h +index c9f0ed24cb7b..7773f3d855a9 100644 +--- a/fs/netfs/internal.h ++++ b/fs/netfs/internal.h +@@ -7,7 +7,6 @@ + + #include + #include +-#include + #include + #include + #include +@@ -23,9 +22,15 @@ + /* + * buffered_read.c + */ ++void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); + int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len); + ++/* ++ * io.c ++ */ ++int netfs_begin_read(struct netfs_io_request *rreq, bool sync); ++ + /* + * main.c + */ +@@ -58,11 +63,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} + /* + * misc.c + */ +-int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, +- bool needs_put); +-struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq); +-void netfs_clear_buffer(struct netfs_io_request *rreq); +-void netfs_reset_iter(struct netfs_io_subrequest *subreq); + + /* + * objects.c +@@ -83,28 +83,6 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, + trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); + } + +-/* +- * read_collect.c +- */ +-void netfs_read_termination_worker(struct work_struct *work); +-void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async); +- +-/* +- * read_pgpriv2.c +- */ +-void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, +- struct netfs_io_request *rreq, +- struct folio_queue *folioq, +- int slot); +-void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq); +-bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request 
*wreq); +- +-/* +- * read_retry.c +- */ +-void netfs_retry_reads(struct netfs_io_request *rreq); +-void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq); +- + /* + * stats.c + */ +@@ -132,7 +110,6 @@ extern atomic_t netfs_n_wh_buffered_write; + extern atomic_t netfs_n_wh_writethrough; + extern atomic_t netfs_n_wh_dio_write; + extern atomic_t netfs_n_wh_writepages; +-extern atomic_t netfs_n_wh_copy_to_cache; + extern atomic_t netfs_n_wh_wstream_conflict; + extern atomic_t netfs_n_wh_upload; + extern atomic_t netfs_n_wh_upload_done; +@@ -140,9 +117,6 @@ extern atomic_t netfs_n_wh_upload_failed; + extern atomic_t netfs_n_wh_write; + extern atomic_t netfs_n_wh_write_done; + extern atomic_t netfs_n_wh_write_failed; +-extern atomic_t netfs_n_wb_lock_skip; +-extern atomic_t netfs_n_wb_lock_wait; +-extern atomic_t netfs_n_folioq; + + int netfs_stats_show(struct seq_file *m, void *v); + +@@ -176,10 +150,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, + loff_t start, + enum netfs_io_origin origin); + void netfs_reissue_write(struct netfs_io_stream *stream, +- struct netfs_io_subrequest *subreq, +- struct iov_iter *source); +-void netfs_issue_write(struct netfs_io_request *wreq, +- struct netfs_io_stream *stream); ++ struct netfs_io_subrequest *subreq); + int netfs_advance_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start, size_t len, bool to_eof); +diff --git a/fs/netfs/io.c b/fs/netfs/io.c +new file mode 100644 +index 000000000000..943128507af5 +--- /dev/null ++++ b/fs/netfs/io.c +@@ -0,0 +1,804 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* Network filesystem high-level read support. ++ * ++ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
++ * Written by David Howells (dhowells@redhat.com) ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "internal.h" ++ ++/* ++ * Clear the unread part of an I/O request. ++ */ ++static void netfs_clear_unread(struct netfs_io_subrequest *subreq) ++{ ++ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); ++} ++ ++static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, ++ bool was_async) ++{ ++ struct netfs_io_subrequest *subreq = priv; ++ ++ netfs_subreq_terminated(subreq, transferred_or_error, was_async); ++} ++ ++/* ++ * Issue a read against the cache. ++ * - Eats the caller's ref on subreq. ++ */ ++static void netfs_read_from_cache(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq, ++ enum netfs_read_from_hole read_hole) ++{ ++ struct netfs_cache_resources *cres = &rreq->cache_resources; ++ ++ netfs_stat(&netfs_n_rh_read); ++ cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole, ++ netfs_cache_read_terminated, subreq); ++} ++ ++/* ++ * Fill a subrequest region with zeroes. ++ */ ++static void netfs_fill_with_zeroes(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq) ++{ ++ netfs_stat(&netfs_n_rh_zero); ++ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); ++ netfs_subreq_terminated(subreq, 0, false); ++} ++ ++/* ++ * Ask the netfs to issue a read request to the server for us. ++ * ++ * The netfs is expected to read from subreq->pos + subreq->transferred to ++ * subreq->pos + subreq->len - 1. It may not backtrack and write data into the ++ * buffer prior to the transferred point as it might clobber dirty data ++ * obtained from the cache. ++ * ++ * Alternatively, the netfs is allowed to indicate one of two things: ++ * ++ * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and ++ * make progress. ++ * ++ * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be ++ * cleared. 
++ */ ++static void netfs_read_from_server(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq) ++{ ++ netfs_stat(&netfs_n_rh_download); ++ ++ if (rreq->origin != NETFS_DIO_READ && ++ iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) ++ pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n", ++ rreq->debug_id, subreq->debug_index, ++ iov_iter_count(&subreq->io_iter), subreq->len, ++ subreq->transferred, subreq->flags); ++ rreq->netfs_ops->issue_read(subreq); ++} ++ ++/* ++ * Release those waiting. ++ */ ++static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) ++{ ++ trace_netfs_rreq(rreq, netfs_rreq_trace_done); ++ netfs_clear_subrequests(rreq, was_async); ++ netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); ++} ++ ++/* ++ * [DEPRECATED] Deal with the completion of writing the data to the cache. We ++ * have to clear the PG_fscache bits on the folios involved and release the ++ * caller's ref. ++ * ++ * May be called in softirq mode and we inherit a ref from the caller. ++ */ ++static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, ++ bool was_async) ++{ ++ struct netfs_io_subrequest *subreq; ++ struct folio *folio; ++ pgoff_t unlocked = 0; ++ bool have_unlocked = false; ++ ++ rcu_read_lock(); ++ ++ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { ++ XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); ++ ++ xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { ++ if (xas_retry(&xas, folio)) ++ continue; ++ ++ /* We might have multiple writes from the same huge ++ * folio, but we mustn't unlock a folio more than once. 
++ */ ++ if (have_unlocked && folio->index <= unlocked) ++ continue; ++ unlocked = folio_next_index(folio) - 1; ++ trace_netfs_folio(folio, netfs_folio_trace_end_copy); ++ folio_end_private_2(folio); ++ have_unlocked = true; ++ } ++ } ++ ++ rcu_read_unlock(); ++ netfs_rreq_completed(rreq, was_async); ++} ++ ++static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, ++ bool was_async) /* [DEPRECATED] */ ++{ ++ struct netfs_io_subrequest *subreq = priv; ++ struct netfs_io_request *rreq = subreq->rreq; ++ ++ if (IS_ERR_VALUE(transferred_or_error)) { ++ netfs_stat(&netfs_n_rh_write_failed); ++ trace_netfs_failure(rreq, subreq, transferred_or_error, ++ netfs_fail_copy_to_cache); ++ } else { ++ netfs_stat(&netfs_n_rh_write_done); ++ } ++ ++ trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); ++ ++ /* If we decrement nr_copy_ops to 0, the ref belongs to us. */ ++ if (atomic_dec_and_test(&rreq->nr_copy_ops)) ++ netfs_rreq_unmark_after_write(rreq, was_async); ++ ++ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); ++} ++ ++/* ++ * [DEPRECATED] Perform any outstanding writes to the cache. We inherit a ref ++ * from the caller. ++ */ ++static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) ++{ ++ struct netfs_cache_resources *cres = &rreq->cache_resources; ++ struct netfs_io_subrequest *subreq, *next, *p; ++ struct iov_iter iter; ++ int ret; ++ ++ trace_netfs_rreq(rreq, netfs_rreq_trace_copy); ++ ++ /* We don't want terminating writes trying to wake us up whilst we're ++ * still going through the list. 
++ */ ++ atomic_inc(&rreq->nr_copy_ops); ++ ++ list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { ++ if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { ++ list_del_init(&subreq->rreq_link); ++ netfs_put_subrequest(subreq, false, ++ netfs_sreq_trace_put_no_copy); ++ } ++ } ++ ++ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { ++ /* Amalgamate adjacent writes */ ++ while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { ++ next = list_next_entry(subreq, rreq_link); ++ if (next->start != subreq->start + subreq->len) ++ break; ++ subreq->len += next->len; ++ list_del_init(&next->rreq_link); ++ netfs_put_subrequest(next, false, ++ netfs_sreq_trace_put_merged); ++ } ++ ++ ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, ++ subreq->len, rreq->i_size, true); ++ if (ret < 0) { ++ trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); ++ trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); ++ continue; ++ } ++ ++ iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, ++ subreq->start, subreq->len); ++ ++ atomic_inc(&rreq->nr_copy_ops); ++ netfs_stat(&netfs_n_rh_write); ++ netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); ++ trace_netfs_sreq(subreq, netfs_sreq_trace_write); ++ cres->ops->write(cres, subreq->start, &iter, ++ netfs_rreq_copy_terminated, subreq); ++ } ++ ++ /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. 
*/ ++ if (atomic_dec_and_test(&rreq->nr_copy_ops)) ++ netfs_rreq_unmark_after_write(rreq, false); ++} ++ ++static void netfs_rreq_write_to_cache_work(struct work_struct *work) /* [DEPRECATED] */ ++{ ++ struct netfs_io_request *rreq = ++ container_of(work, struct netfs_io_request, work); ++ ++ netfs_rreq_do_write_to_cache(rreq); ++} ++ ++static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) /* [DEPRECATED] */ ++{ ++ rreq->work.func = netfs_rreq_write_to_cache_work; ++ if (!queue_work(system_unbound_wq, &rreq->work)) ++ BUG(); ++} ++ ++/* ++ * Handle a short read. ++ */ ++static void netfs_rreq_short_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq) ++{ ++ __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); ++ __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags); ++ ++ netfs_stat(&netfs_n_rh_short_read); ++ trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short); ++ ++ netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read); ++ atomic_inc(&rreq->nr_outstanding); ++ if (subreq->source == NETFS_READ_FROM_CACHE) ++ netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); ++ else ++ netfs_read_from_server(rreq, subreq); ++} ++ ++/* ++ * Reset the subrequest iterator prior to resubmission. ++ */ ++static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq) ++{ ++ size_t remaining = subreq->len - subreq->transferred; ++ size_t count = iov_iter_count(&subreq->io_iter); ++ ++ if (count == remaining) ++ return; ++ ++ _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", ++ rreq->debug_id, subreq->debug_index, ++ iov_iter_count(&subreq->io_iter), subreq->transferred, ++ subreq->len, rreq->i_size, ++ subreq->io_iter.iter_type); ++ ++ if (count < remaining) ++ iov_iter_revert(&subreq->io_iter, remaining - count); ++ else ++ iov_iter_advance(&subreq->io_iter, count - remaining); ++} ++ ++/* ++ * Resubmit any short or failed operations. 
Returns true if we got the rreq ++ * ref back. ++ */ ++static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) ++{ ++ struct netfs_io_subrequest *subreq; ++ ++ WARN_ON(in_interrupt()); ++ ++ trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); ++ ++ /* We don't want terminating submissions trying to wake us up whilst ++ * we're still going through the list. ++ */ ++ atomic_inc(&rreq->nr_outstanding); ++ ++ __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); ++ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { ++ if (subreq->error) { ++ if (subreq->source != NETFS_READ_FROM_CACHE) ++ break; ++ subreq->source = NETFS_DOWNLOAD_FROM_SERVER; ++ subreq->error = 0; ++ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); ++ netfs_stat(&netfs_n_rh_download_instead); ++ trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); ++ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); ++ atomic_inc(&rreq->nr_outstanding); ++ netfs_reset_subreq_iter(rreq, subreq); ++ netfs_read_from_server(rreq, subreq); ++ } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { ++ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); ++ netfs_reset_subreq_iter(rreq, subreq); ++ netfs_rreq_short_read(rreq, subreq); ++ } ++ } ++ ++ /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */ ++ if (atomic_dec_and_test(&rreq->nr_outstanding)) ++ return true; ++ ++ wake_up_var(&rreq->nr_outstanding); ++ return false; ++} ++ ++/* ++ * Check to see if the data read is still valid. 
++ */ ++static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) ++{ ++ struct netfs_io_subrequest *subreq; ++ ++ if (!rreq->netfs_ops->is_still_valid || ++ rreq->netfs_ops->is_still_valid(rreq)) ++ return; ++ ++ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { ++ if (subreq->source == NETFS_READ_FROM_CACHE) { ++ subreq->error = -ESTALE; ++ __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); ++ } ++ } ++} ++ ++/* ++ * Determine how much we can admit to having read from a DIO read. ++ */ ++static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) ++{ ++ struct netfs_io_subrequest *subreq; ++ unsigned int i; ++ size_t transferred = 0; ++ ++ for (i = 0; i < rreq->direct_bv_count; i++) { ++ flush_dcache_page(rreq->direct_bv[i].bv_page); ++ // TODO: cifs marks pages in the destination buffer ++ // dirty under some circumstances after a read. Do we ++ // need to do that too? ++ set_page_dirty(rreq->direct_bv[i].bv_page); ++ } ++ ++ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { ++ if (subreq->error || subreq->transferred == 0) ++ break; ++ transferred += subreq->transferred; ++ if (subreq->transferred < subreq->len || ++ test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) ++ break; ++ } ++ ++ for (i = 0; i < rreq->direct_bv_count; i++) ++ flush_dcache_page(rreq->direct_bv[i].bv_page); ++ ++ rreq->transferred = transferred; ++ task_io_account_read(transferred); ++ ++ if (rreq->iocb) { ++ rreq->iocb->ki_pos += transferred; ++ if (rreq->iocb->ki_complete) ++ rreq->iocb->ki_complete( ++ rreq->iocb, rreq->error ? rreq->error : transferred); ++ } ++ if (rreq->netfs_ops->done) ++ rreq->netfs_ops->done(rreq); ++ inode_dio_end(rreq->inode); ++} ++ ++/* ++ * Assess the state of a read request and decide what to do next. ++ * ++ * Note that we could be in an ordinary kernel thread, on a workqueue or in ++ * softirq context at this point. We inherit a ref from the caller. 
++ */ ++static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async) ++{ ++ trace_netfs_rreq(rreq, netfs_rreq_trace_assess); ++ ++again: ++ netfs_rreq_is_still_valid(rreq); ++ ++ if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) && ++ test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) { ++ if (netfs_rreq_perform_resubmissions(rreq)) ++ goto again; ++ return; ++ } ++ ++ if (rreq->origin != NETFS_DIO_READ) ++ netfs_rreq_unlock_folios(rreq); ++ else ++ netfs_rreq_assess_dio(rreq); ++ ++ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); ++ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); ++ wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); ++ ++ if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags) && ++ test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) ++ return netfs_rreq_write_to_cache(rreq); ++ ++ netfs_rreq_completed(rreq, was_async); ++} ++ ++static void netfs_rreq_work(struct work_struct *work) ++{ ++ struct netfs_io_request *rreq = ++ container_of(work, struct netfs_io_request, work); ++ netfs_rreq_assess(rreq, false); ++} ++ ++/* ++ * Handle the completion of all outstanding I/O operations on a read request. ++ * We inherit a ref from the caller. ++ */ ++static void netfs_rreq_terminated(struct netfs_io_request *rreq, ++ bool was_async) ++{ ++ if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) && ++ was_async) { ++ if (!queue_work(system_unbound_wq, &rreq->work)) ++ BUG(); ++ } else { ++ netfs_rreq_assess(rreq, was_async); ++ } ++} ++ ++/** ++ * netfs_subreq_terminated - Note the termination of an I/O operation. ++ * @subreq: The I/O request that has terminated. ++ * @transferred_or_error: The amount of data transferred or an error code. ++ * @was_async: The termination was asynchronous ++ * ++ * This tells the read helper that a contributory I/O operation has terminated, ++ * one way or another, and that it should integrate the results. 
++ * ++ * The caller indicates in @transferred_or_error the outcome of the operation, ++ * supplying a positive value to indicate the number of bytes transferred, 0 to ++ * indicate a failure to transfer anything that should be retried or a negative ++ * error code. The helper will look after reissuing I/O operations as ++ * appropriate and writing downloaded data to the cache. ++ * ++ * If @was_async is true, the caller might be running in softirq or interrupt ++ * context and we can't sleep. ++ */ ++void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, ++ ssize_t transferred_or_error, ++ bool was_async) ++{ ++ struct netfs_io_request *rreq = subreq->rreq; ++ int u; ++ ++ _enter("R=%x[%x]{%llx,%lx},%zd", ++ rreq->debug_id, subreq->debug_index, ++ subreq->start, subreq->flags, transferred_or_error); ++ ++ switch (subreq->source) { ++ case NETFS_READ_FROM_CACHE: ++ netfs_stat(&netfs_n_rh_read_done); ++ break; ++ case NETFS_DOWNLOAD_FROM_SERVER: ++ netfs_stat(&netfs_n_rh_download_done); ++ break; ++ default: ++ break; ++ } ++ ++ if (IS_ERR_VALUE(transferred_or_error)) { ++ subreq->error = transferred_or_error; ++ trace_netfs_failure(rreq, subreq, transferred_or_error, ++ netfs_fail_read); ++ goto failed; ++ } ++ ++ if (WARN(transferred_or_error > subreq->len - subreq->transferred, ++ "Subreq overread: R%x[%x] %zd > %zu - %zu", ++ rreq->debug_id, subreq->debug_index, ++ transferred_or_error, subreq->len, subreq->transferred)) ++ transferred_or_error = subreq->len - subreq->transferred; ++ ++ subreq->error = 0; ++ subreq->transferred += transferred_or_error; ++ if (subreq->transferred < subreq->len && ++ !test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) ++ goto incomplete; ++ ++complete: ++ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); ++ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) ++ set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); ++ ++out: ++ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); ++ ++ /* If we decrement nr_outstanding to 
0, the ref belongs to us. */ ++ u = atomic_dec_return(&rreq->nr_outstanding); ++ if (u == 0) ++ netfs_rreq_terminated(rreq, was_async); ++ else if (u == 1) ++ wake_up_var(&rreq->nr_outstanding); ++ ++ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); ++ return; ++ ++incomplete: ++ if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { ++ netfs_clear_unread(subreq); ++ subreq->transferred = subreq->len; ++ goto complete; ++ } ++ ++ if (transferred_or_error == 0) { ++ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { ++ if (rreq->origin != NETFS_DIO_READ) ++ subreq->error = -ENODATA; ++ goto failed; ++ } ++ } else { ++ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); ++ } ++ ++ __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); ++ set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); ++ goto out; ++ ++failed: ++ if (subreq->source == NETFS_READ_FROM_CACHE) { ++ netfs_stat(&netfs_n_rh_read_failed); ++ set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); ++ } else { ++ netfs_stat(&netfs_n_rh_download_failed); ++ set_bit(NETFS_RREQ_FAILED, &rreq->flags); ++ rreq->error = subreq->error; ++ } ++ goto out; ++} ++EXPORT_SYMBOL(netfs_subreq_terminated); ++ ++static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq, ++ loff_t i_size) ++{ ++ struct netfs_io_request *rreq = subreq->rreq; ++ struct netfs_cache_resources *cres = &rreq->cache_resources; ++ ++ if (cres->ops) ++ return cres->ops->prepare_read(subreq, i_size); ++ if (subreq->start >= rreq->i_size) ++ return NETFS_FILL_WITH_ZEROES; ++ return NETFS_DOWNLOAD_FROM_SERVER; ++} ++ ++/* ++ * Work out what sort of subrequest the next one will be. 
++ */ ++static enum netfs_io_source ++netfs_rreq_prepare_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq, ++ struct iov_iter *io_iter) ++{ ++ enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; ++ struct netfs_inode *ictx = netfs_inode(rreq->inode); ++ size_t lsize; ++ ++ _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); ++ ++ if (rreq->origin != NETFS_DIO_READ) { ++ source = netfs_cache_prepare_read(subreq, rreq->i_size); ++ if (source == NETFS_INVALID_READ) ++ goto out; ++ } ++ ++ if (source == NETFS_DOWNLOAD_FROM_SERVER) { ++ /* Call out to the netfs to let it shrink the request to fit ++ * its own I/O sizes and boundaries. If it shinks it here, it ++ * will be called again to make simultaneous calls; if it wants ++ * to make serial calls, it can indicate a short read and then ++ * we will call it again. ++ */ ++ if (rreq->origin != NETFS_DIO_READ) { ++ if (subreq->start >= ictx->zero_point) { ++ source = NETFS_FILL_WITH_ZEROES; ++ goto set; ++ } ++ if (subreq->len > ictx->zero_point - subreq->start) ++ subreq->len = ictx->zero_point - subreq->start; ++ ++ /* We limit buffered reads to the EOF, but let the ++ * server deal with larger-than-EOF DIO/unbuffered ++ * reads. 
++ */ ++ if (subreq->len > rreq->i_size - subreq->start) ++ subreq->len = rreq->i_size - subreq->start; ++ } ++ if (rreq->rsize && subreq->len > rreq->rsize) ++ subreq->len = rreq->rsize; ++ ++ if (rreq->netfs_ops->clamp_length && ++ !rreq->netfs_ops->clamp_length(subreq)) { ++ source = NETFS_INVALID_READ; ++ goto out; ++ } ++ ++ if (subreq->max_nr_segs) { ++ lsize = netfs_limit_iter(io_iter, 0, subreq->len, ++ subreq->max_nr_segs); ++ if (subreq->len > lsize) { ++ subreq->len = lsize; ++ trace_netfs_sreq(subreq, netfs_sreq_trace_limited); ++ } ++ } ++ } ++ ++set: ++ if (subreq->len > rreq->len) ++ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n", ++ rreq->debug_id, subreq->debug_index, ++ subreq->len, rreq->len); ++ ++ if (WARN_ON(subreq->len == 0)) { ++ source = NETFS_INVALID_READ; ++ goto out; ++ } ++ ++ subreq->source = source; ++ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); ++ ++ subreq->io_iter = *io_iter; ++ iov_iter_truncate(&subreq->io_iter, subreq->len); ++ iov_iter_advance(io_iter, subreq->len); ++out: ++ subreq->source = source; ++ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); ++ return source; ++} ++ ++/* ++ * Slice off a piece of a read request and submit an I/O request for it. ++ */ ++static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, ++ struct iov_iter *io_iter) ++{ ++ struct netfs_io_subrequest *subreq; ++ enum netfs_io_source source; ++ ++ subreq = netfs_alloc_subrequest(rreq); ++ if (!subreq) ++ return false; ++ ++ subreq->start = rreq->start + rreq->submitted; ++ subreq->len = io_iter->count; ++ ++ _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); ++ list_add_tail(&subreq->rreq_link, &rreq->subrequests); ++ ++ /* Call out to the cache to find out what it can do with the remaining ++ * subset. It tells us in subreq->flags what it decided should be done ++ * and adjusts subreq->len down if the subset crosses a cache boundary. 
++ * ++ * Then when we hand the subset, it can choose to take a subset of that ++ * (the starts must coincide), in which case, we go around the loop ++ * again and ask it to download the next piece. ++ */ ++ source = netfs_rreq_prepare_read(rreq, subreq, io_iter); ++ if (source == NETFS_INVALID_READ) ++ goto subreq_failed; ++ ++ atomic_inc(&rreq->nr_outstanding); ++ ++ rreq->submitted += subreq->len; ++ ++ trace_netfs_sreq(subreq, netfs_sreq_trace_submit); ++ switch (source) { ++ case NETFS_FILL_WITH_ZEROES: ++ netfs_fill_with_zeroes(rreq, subreq); ++ break; ++ case NETFS_DOWNLOAD_FROM_SERVER: ++ netfs_read_from_server(rreq, subreq); ++ break; ++ case NETFS_READ_FROM_CACHE: ++ netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); ++ break; ++ default: ++ BUG(); ++ } ++ ++ return true; ++ ++subreq_failed: ++ rreq->error = subreq->error; ++ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed); ++ return false; ++} ++ ++/* ++ * Begin the process of reading in a chunk of data, where that data may be ++ * stitched together from multiple sources, including multiple servers and the ++ * local cache. ++ */ ++int netfs_begin_read(struct netfs_io_request *rreq, bool sync) ++{ ++ struct iov_iter io_iter; ++ int ret; ++ ++ _enter("R=%x %llx-%llx", ++ rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); ++ ++ if (rreq->len == 0) { ++ pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); ++ return -EIO; ++ } ++ ++ if (rreq->origin == NETFS_DIO_READ) ++ inode_dio_begin(rreq->inode); ++ ++ // TODO: Use bounce buffer if requested ++ rreq->io_iter = rreq->iter; ++ ++ INIT_WORK(&rreq->work, netfs_rreq_work); ++ ++ /* Chop the read into slices according to what the cache and the netfs ++ * want and submit each one. 
++ */ ++ netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding); ++ atomic_set(&rreq->nr_outstanding, 1); ++ io_iter = rreq->io_iter; ++ do { ++ _debug("submit %llx + %llx >= %llx", ++ rreq->start, rreq->submitted, rreq->i_size); ++ if (!netfs_rreq_submit_slice(rreq, &io_iter)) ++ break; ++ if (test_bit(NETFS_SREQ_NO_PROGRESS, &rreq->flags)) ++ break; ++ if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && ++ test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) ++ break; ++ ++ } while (rreq->submitted < rreq->len); ++ ++ if (!rreq->submitted) { ++ netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); ++ if (rreq->origin == NETFS_DIO_READ) ++ inode_dio_end(rreq->inode); ++ ret = 0; ++ goto out; ++ } ++ ++ if (sync) { ++ /* Keep nr_outstanding incremented so that the ref always ++ * belongs to us, and the service code isn't punted off to a ++ * random thread pool to process. Note that this might start ++ * further work, such as writing to the cache. ++ */ ++ wait_var_event(&rreq->nr_outstanding, ++ atomic_read(&rreq->nr_outstanding) == 1); ++ if (atomic_dec_and_test(&rreq->nr_outstanding)) ++ netfs_rreq_assess(rreq, false); ++ ++ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); ++ wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, ++ TASK_UNINTERRUPTIBLE); ++ ++ ret = rreq->error; ++ if (ret == 0) { ++ if (rreq->origin == NETFS_DIO_READ) { ++ ret = rreq->transferred; ++ } else if (rreq->submitted < rreq->len) { ++ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); ++ ret = -EIO; ++ } ++ } ++ } else { ++ /* If we decrement nr_outstanding to 0, the ref belongs to us. 
*/ ++ if (atomic_dec_and_test(&rreq->nr_outstanding)) ++ netfs_rreq_assess(rreq, false); ++ ret = -EIOCBQUEUED; ++ } ++ ++out: ++ return ret; ++} +diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c +index 72a435e5fc6d..b781bbbf1d8d 100644 +--- a/fs/netfs/iterator.c ++++ b/fs/netfs/iterator.c +@@ -188,59 +188,9 @@ static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offse + return min(span, max_size); + } + +-/* +- * Select the span of a folio queue iterator we're going to use. Limit it by +- * both maximum size and maximum number of segments. Returns the size of the +- * span in bytes. +- */ +-static size_t netfs_limit_folioq(const struct iov_iter *iter, size_t start_offset, +- size_t max_size, size_t max_segs) +-{ +- const struct folio_queue *folioq = iter->folioq; +- unsigned int nsegs = 0; +- unsigned int slot = iter->folioq_slot; +- size_t span = 0, n = iter->count; +- +- if (WARN_ON(!iov_iter_is_folioq(iter)) || +- WARN_ON(start_offset > n) || +- n == 0) +- return 0; +- max_size = umin(max_size, n - start_offset); +- +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = folioq->next; +- slot = 0; +- } +- +- start_offset += iter->iov_offset; +- do { +- size_t flen = folioq_folio_size(folioq, slot); +- +- if (start_offset < flen) { +- span += flen - start_offset; +- nsegs++; +- start_offset = 0; +- } else { +- start_offset -= flen; +- } +- if (span >= max_size || nsegs >= max_segs) +- break; +- +- slot++; +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = folioq->next; +- slot = 0; +- } +- } while (folioq); +- +- return umin(span, max_size); +-} +- + size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) + { +- if (iov_iter_is_folioq(iter)) +- return netfs_limit_folioq(iter, start_offset, max_size, max_segs); + if (iov_iter_is_bvec(iter)) + return netfs_limit_bvec(iter, start_offset, max_size, max_segs); + if (iov_iter_is_xarray(iter)) +diff --git a/fs/netfs/main.c 
b/fs/netfs/main.c +index 6c7be1377ee0..9d6b49dc6694 100644 +--- a/fs/netfs/main.c ++++ b/fs/netfs/main.c +@@ -36,14 +36,13 @@ DEFINE_SPINLOCK(netfs_proc_lock); + static const char *netfs_origins[nr__netfs_io_origin] = { + [NETFS_READAHEAD] = "RA", + [NETFS_READPAGE] = "RP", +- [NETFS_READ_GAPS] = "RG", + [NETFS_READ_FOR_WRITE] = "RW", +- [NETFS_DIO_READ] = "DR", ++ [NETFS_COPY_TO_CACHE] = "CC", + [NETFS_WRITEBACK] = "WB", + [NETFS_WRITETHROUGH] = "WT", + [NETFS_UNBUFFERED_WRITE] = "UW", ++ [NETFS_DIO_READ] = "DR", + [NETFS_DIO_WRITE] = "DW", +- [NETFS_PGPRIV2_COPY_TO_CACHE] = "2C", + }; + + /* +@@ -63,7 +62,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) + + rreq = list_entry(v, struct netfs_io_request, proc_link); + seq_printf(m, +- "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx", ++ "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx", + rreq->debug_id, + netfs_origins[rreq->origin], + refcount_read(&rreq->ref), +diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c +index 0ad0982ce0e2..c1f321cf5999 100644 +--- a/fs/netfs/misc.c ++++ b/fs/netfs/misc.c +@@ -8,100 +8,6 @@ + #include + #include "internal.h" + +-/* +- * Append a folio to the rolling queue. 
+- */ +-int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, +- bool needs_put) +-{ +- struct folio_queue *tail = rreq->buffer_tail; +- unsigned int slot, order = folio_order(folio); +- +- if (WARN_ON_ONCE(!rreq->buffer && tail) || +- WARN_ON_ONCE(rreq->buffer && !tail)) +- return -EIO; +- +- if (!tail || folioq_full(tail)) { +- tail = kmalloc(sizeof(*tail), GFP_NOFS); +- if (!tail) +- return -ENOMEM; +- netfs_stat(&netfs_n_folioq); +- folioq_init(tail); +- tail->prev = rreq->buffer_tail; +- if (tail->prev) +- tail->prev->next = tail; +- rreq->buffer_tail = tail; +- if (!rreq->buffer) { +- rreq->buffer = tail; +- iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0); +- } +- rreq->buffer_tail_slot = 0; +- } +- +- rreq->io_iter.count += PAGE_SIZE << order; +- +- slot = folioq_append(tail, folio); +- /* Store the counter after setting the slot. */ +- smp_store_release(&rreq->buffer_tail_slot, slot); +- return 0; +-} +- +-/* +- * Delete the head of a rolling queue. +- */ +-struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq) +-{ +- struct folio_queue *head = wreq->buffer, *next = head->next; +- +- if (next) +- next->prev = NULL; +- netfs_stat_d(&netfs_n_folioq); +- kfree(head); +- wreq->buffer = next; +- return next; +-} +- +-/* +- * Clear out a rolling queue. +- */ +-void netfs_clear_buffer(struct netfs_io_request *rreq) +-{ +- struct folio_queue *p; +- +- while ((p = rreq->buffer)) { +- rreq->buffer = p->next; +- for (int slot = 0; slot < folioq_nr_slots(p); slot++) { +- struct folio *folio = folioq_folio(p, slot); +- if (!folio) +- continue; +- if (folioq_is_marked(p, slot)) { +- trace_netfs_folio(folio, netfs_folio_trace_put); +- folio_put(folio); +- } +- } +- netfs_stat_d(&netfs_n_folioq); +- kfree(p); +- } +-} +- +-/* +- * Reset the subrequest iterator to refer just to the region remaining to be +- * read. 
The iterator may or may not have been advanced by socket ops or +- * extraction ops to an extent that may or may not match the amount actually +- * read. +- */ +-void netfs_reset_iter(struct netfs_io_subrequest *subreq) +-{ +- struct iov_iter *io_iter = &subreq->io_iter; +- size_t remain = subreq->len - subreq->transferred; +- +- if (io_iter->count > remain) +- iov_iter_advance(io_iter, io_iter->count - remain); +- else if (io_iter->count < remain) +- iov_iter_revert(io_iter, remain - io_iter->count); +- iov_iter_truncate(&subreq->io_iter, remain); +-} +- + /** + * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback + * @mapping: The mapping the folio belongs to. +diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c +index 31e388ec6e48..0294df70c3ff 100644 +--- a/fs/netfs/objects.c ++++ b/fs/netfs/objects.c +@@ -36,6 +36,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, + memset(rreq, 0, kmem_cache_size(cache)); + rreq->start = start; + rreq->len = len; ++ rreq->upper_len = len; + rreq->origin = origin; + rreq->netfs_ops = ctx->ops; + rreq->mapping = mapping; +@@ -43,23 +44,13 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, + rreq->i_size = i_size_read(inode); + rreq->debug_id = atomic_inc_return(&debug_ids); + rreq->wsize = INT_MAX; +- rreq->io_streams[0].sreq_max_len = ULONG_MAX; +- rreq->io_streams[0].sreq_max_segs = 0; + spin_lock_init(&rreq->lock); + INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); + INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); + INIT_LIST_HEAD(&rreq->subrequests); ++ INIT_WORK(&rreq->work, NULL); + refcount_set(&rreq->ref, 1); + +- if (origin == NETFS_READAHEAD || +- origin == NETFS_READPAGE || +- origin == NETFS_READ_GAPS || +- origin == NETFS_READ_FOR_WRITE || +- origin == NETFS_DIO_READ) +- INIT_WORK(&rreq->work, netfs_read_termination_worker); +- else +- INIT_WORK(&rreq->work, netfs_write_collection_worker); +- + __set_bit(NETFS_RREQ_IN_PROGRESS, 
&rreq->flags); + if (file && file->f_flags & O_NONBLOCK) + __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); +@@ -143,7 +134,6 @@ static void netfs_free_request(struct work_struct *work) + } + kvfree(rreq->direct_bv); + } +- netfs_clear_buffer(rreq); + + if (atomic_dec_and_test(&ictx->io_count)) + wake_up_var(&ictx->io_count); +@@ -165,7 +155,7 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async, + if (was_async) { + rreq->work.func = netfs_free_request; + if (!queue_work(system_unbound_wq, &rreq->work)) +- WARN_ON(1); ++ BUG(); + } else { + netfs_free_request(&rreq->work); + } +diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c +deleted file mode 100644 +index b18c65ba5580..000000000000 +--- a/fs/netfs/read_collect.c ++++ /dev/null +@@ -1,544 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-only +-/* Network filesystem read subrequest result collection, assessment and +- * retrying. +- * +- * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. +- * Written by David Howells (dhowells@redhat.com) +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include "internal.h" +- +-/* +- * Clear the unread part of an I/O request. +- */ +-static void netfs_clear_unread(struct netfs_io_subrequest *subreq) +-{ +- netfs_reset_iter(subreq); +- WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter)); +- iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); +- if (subreq->start + subreq->transferred >= subreq->rreq->i_size) +- __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); +-} +- +-/* +- * Flush, mark and unlock a folio that's now completely read. If we want to +- * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it +- * dirty and let writeback handle it. 
+- */ +-static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, +- struct netfs_io_request *rreq, +- struct folio_queue *folioq, +- int slot) +-{ +- struct netfs_folio *finfo; +- struct folio *folio = folioq_folio(folioq, slot); +- +- flush_dcache_folio(folio); +- folio_mark_uptodate(folio); +- +- if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { +- finfo = netfs_folio_info(folio); +- if (finfo) { +- trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); +- if (finfo->netfs_group) +- folio_change_private(folio, finfo->netfs_group); +- else +- folio_detach_private(folio); +- kfree(finfo); +- } +- +- if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { +- if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) { +- trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); +- folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); +- folio_mark_dirty(folio); +- } +- } else { +- trace_netfs_folio(folio, netfs_folio_trace_read_done); +- } +- } else { +- // TODO: Use of PG_private_2 is deprecated. +- if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) +- netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); +- } +- +- if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { +- if (folio->index == rreq->no_unlock_folio && +- test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { +- _debug("no unlock"); +- } else { +- trace_netfs_folio(folio, netfs_folio_trace_read_unlock); +- folio_unlock(folio); +- } +- } +-} +- +-/* +- * Unlock any folios that are now completely read. Returns true if the +- * subrequest is removed from the list. 
+- */ +-static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async) +-{ +- struct netfs_io_subrequest *prev, *next; +- struct netfs_io_request *rreq = subreq->rreq; +- struct folio_queue *folioq = subreq->curr_folioq; +- size_t avail, prev_donated, next_donated, fsize, part, excess; +- loff_t fpos, start; +- loff_t fend; +- int slot = subreq->curr_folioq_slot; +- +- if (WARN(subreq->transferred > subreq->len, +- "Subreq overread: R%x[%x] %zu > %zu", +- rreq->debug_id, subreq->debug_index, +- subreq->transferred, subreq->len)) +- subreq->transferred = subreq->len; +- +-next_folio: +- fsize = PAGE_SIZE << subreq->curr_folio_order; +- fpos = round_down(subreq->start + subreq->consumed, fsize); +- fend = fpos + fsize; +- +- if (WARN_ON_ONCE(!folioq) || +- WARN_ON_ONCE(!folioq_folio(folioq, slot)) || +- WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) { +- pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n", +- rreq->debug_id, subreq->debug_index, +- subreq->start, subreq->start + subreq->transferred - 1, +- subreq->consumed, subreq->transferred, subreq->len, +- slot); +- if (folioq) { +- struct folio *folio = folioq_folio(folioq, slot); +- +- pr_err("folioq: orders=%02x%02x%02x%02x\n", +- folioq->orders[0], folioq->orders[1], +- folioq->orders[2], folioq->orders[3]); +- if (folio) +- pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n", +- fpos, fend - 1, folio_pos(folio), folio_order(folio), +- folioq_folio_order(folioq, slot)); +- } +- } +- +-donation_changed: +- /* Try to consume the current folio if we've hit or passed the end of +- * it. There's a possibility that this subreq doesn't start at the +- * beginning of the folio, in which case we need to donate to/from the +- * preceding subreq. +- * +- * We also need to include any potential donation back from the +- * following subreq. 
+- */ +- prev_donated = READ_ONCE(subreq->prev_donated); +- next_donated = READ_ONCE(subreq->next_donated); +- if (prev_donated || next_donated) { +- spin_lock_bh(&rreq->lock); +- prev_donated = subreq->prev_donated; +- next_donated = subreq->next_donated; +- subreq->start -= prev_donated; +- subreq->len += prev_donated; +- subreq->transferred += prev_donated; +- prev_donated = subreq->prev_donated = 0; +- if (subreq->transferred == subreq->len) { +- subreq->len += next_donated; +- subreq->transferred += next_donated; +- next_donated = subreq->next_donated = 0; +- } +- trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations); +- spin_unlock_bh(&rreq->lock); +- } +- +- avail = subreq->transferred; +- if (avail == subreq->len) +- avail += next_donated; +- start = subreq->start; +- if (subreq->consumed == 0) { +- start -= prev_donated; +- avail += prev_donated; +- } else { +- start += subreq->consumed; +- avail -= subreq->consumed; +- } +- part = umin(avail, fsize); +- +- trace_netfs_progress(subreq, start, avail, part); +- +- if (start + avail >= fend) { +- if (fpos == start) { +- /* Flush, unlock and mark for caching any folio we've just read. */ +- subreq->consumed = fend - subreq->start; +- netfs_unlock_read_folio(subreq, rreq, folioq, slot); +- folioq_mark2(folioq, slot); +- if (subreq->consumed >= subreq->len) +- goto remove_subreq; +- } else if (fpos < start) { +- excess = fend - subreq->start; +- +- spin_lock_bh(&rreq->lock); +- /* If we complete first on a folio split with the +- * preceding subreq, donate to that subreq - otherwise +- * we get the responsibility. 
+- */ +- if (subreq->prev_donated != prev_donated) { +- spin_unlock_bh(&rreq->lock); +- goto donation_changed; +- } +- +- if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) { +- spin_unlock_bh(&rreq->lock); +- pr_err("Can't donate prior to front\n"); +- goto bad; +- } +- +- prev = list_prev_entry(subreq, rreq_link); +- WRITE_ONCE(prev->next_donated, prev->next_donated + excess); +- subreq->start += excess; +- subreq->len -= excess; +- subreq->transferred -= excess; +- trace_netfs_donate(rreq, subreq, prev, excess, +- netfs_trace_donate_tail_to_prev); +- trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); +- +- if (subreq->consumed >= subreq->len) +- goto remove_subreq_locked; +- spin_unlock_bh(&rreq->lock); +- } else { +- pr_err("fpos > start\n"); +- goto bad; +- } +- +- /* Advance the rolling buffer to the next folio. */ +- slot++; +- if (slot >= folioq_nr_slots(folioq)) { +- slot = 0; +- folioq = folioq->next; +- subreq->curr_folioq = folioq; +- } +- subreq->curr_folioq_slot = slot; +- if (folioq && folioq_folio(folioq, slot)) +- subreq->curr_folio_order = folioq->orders[slot]; +- if (!was_async) +- cond_resched(); +- goto next_folio; +- } +- +- /* Deal with partial progress. */ +- if (subreq->transferred < subreq->len) +- return false; +- +- /* Donate the remaining downloaded data to one of the neighbouring +- * subrequests. Note that we may race with them doing the same thing. +- */ +- spin_lock_bh(&rreq->lock); +- +- if (subreq->prev_donated != prev_donated || +- subreq->next_donated != next_donated) { +- spin_unlock_bh(&rreq->lock); +- cond_resched(); +- goto donation_changed; +- } +- +- /* Deal with the trickiest case: that this subreq is in the middle of a +- * folio, not touching either edge, but finishes first. In such a +- * case, we donate to the previous subreq, if there is one, so that the +- * donation is only handled when that completes - and remove this +- * subreq from the list. 
+- * +- * If the previous subreq finished first, we will have acquired their +- * donation and should be able to unlock folios and/or donate nextwards. +- */ +- if (!subreq->consumed && +- !prev_donated && +- !list_is_first(&subreq->rreq_link, &rreq->subrequests)) { +- prev = list_prev_entry(subreq, rreq_link); +- WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); +- subreq->start += subreq->len; +- subreq->len = 0; +- subreq->transferred = 0; +- trace_netfs_donate(rreq, subreq, prev, subreq->len, +- netfs_trace_donate_to_prev); +- trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); +- goto remove_subreq_locked; +- } +- +- /* If we can't donate down the chain, donate up the chain instead. */ +- excess = subreq->len - subreq->consumed + next_donated; +- +- if (!subreq->consumed) +- excess += prev_donated; +- +- if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) { +- rreq->prev_donated = excess; +- trace_netfs_donate(rreq, subreq, NULL, excess, +- netfs_trace_donate_to_deferred_next); +- } else { +- next = list_next_entry(subreq, rreq_link); +- WRITE_ONCE(next->prev_donated, excess); +- trace_netfs_donate(rreq, subreq, next, excess, +- netfs_trace_donate_to_next); +- } +- trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next); +- subreq->len = subreq->consumed; +- subreq->transferred = subreq->consumed; +- goto remove_subreq_locked; +- +-remove_subreq: +- spin_lock_bh(&rreq->lock); +-remove_subreq_locked: +- subreq->consumed = subreq->len; +- list_del(&subreq->rreq_link); +- spin_unlock_bh(&rreq->lock); +- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed); +- return true; +- +-bad: +- /* Errr... prev and next both donated to us, but insufficient to finish +- * the folio. 
+- */ +- printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n", +- rreq->debug_id, subreq->debug_index, +- subreq->start, subreq->start + subreq->transferred - 1, +- subreq->consumed, subreq->transferred, subreq->len); +- printk("folio: %llx-%llx\n", fpos, fend - 1); +- printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated); +- printk("s=%llx av=%zx part=%zx\n", start, avail, part); +- BUG(); +-} +- +-/* +- * Do page flushing and suchlike after DIO. +- */ +-static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) +-{ +- struct netfs_io_subrequest *subreq; +- unsigned int i; +- +- /* Collect unbuffered reads and direct reads, adding up the transfer +- * sizes until we find the first short or failed subrequest. +- */ +- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { +- rreq->transferred += subreq->transferred; +- +- if (subreq->transferred < subreq->len || +- test_bit(NETFS_SREQ_FAILED, &subreq->flags)) { +- rreq->error = subreq->error; +- break; +- } +- } +- +- if (rreq->origin == NETFS_DIO_READ) { +- for (i = 0; i < rreq->direct_bv_count; i++) { +- flush_dcache_page(rreq->direct_bv[i].bv_page); +- // TODO: cifs marks pages in the destination buffer +- // dirty under some circumstances after a read. Do we +- // need to do that too? +- set_page_dirty(rreq->direct_bv[i].bv_page); +- } +- } +- +- if (rreq->iocb) { +- rreq->iocb->ki_pos += rreq->transferred; +- if (rreq->iocb->ki_complete) +- rreq->iocb->ki_complete( +- rreq->iocb, rreq->error ? rreq->error : rreq->transferred); +- } +- if (rreq->netfs_ops->done) +- rreq->netfs_ops->done(rreq); +- if (rreq->origin == NETFS_DIO_READ) +- inode_dio_end(rreq->inode); +-} +- +-/* +- * Assess the state of a read request and decide what to do next. +- * +- * Note that we're in normal kernel thread context at this point, possibly +- * running on a workqueue. 
+- */ +-static void netfs_rreq_assess(struct netfs_io_request *rreq) +-{ +- trace_netfs_rreq(rreq, netfs_rreq_trace_assess); +- +- //netfs_rreq_is_still_valid(rreq); +- +- if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) { +- netfs_retry_reads(rreq); +- return; +- } +- +- if (rreq->origin == NETFS_DIO_READ || +- rreq->origin == NETFS_READ_GAPS) +- netfs_rreq_assess_dio(rreq); +- task_io_account_read(rreq->transferred); +- +- trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); +- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); +- wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); +- +- trace_netfs_rreq(rreq, netfs_rreq_trace_done); +- netfs_clear_subrequests(rreq, false); +- netfs_unlock_abandoned_read_pages(rreq); +- if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags))) +- netfs_pgpriv2_write_to_the_cache(rreq); +-} +- +-void netfs_read_termination_worker(struct work_struct *work) +-{ +- struct netfs_io_request *rreq = +- container_of(work, struct netfs_io_request, work); +- netfs_see_request(rreq, netfs_rreq_trace_see_work); +- netfs_rreq_assess(rreq); +- netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete); +-} +- +-/* +- * Handle the completion of all outstanding I/O operations on a read request. +- * We inherit a ref from the caller. +- */ +-void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async) +-{ +- if (!was_async) +- return netfs_rreq_assess(rreq); +- if (!work_pending(&rreq->work)) { +- netfs_get_request(rreq, netfs_rreq_trace_get_work); +- if (!queue_work(system_unbound_wq, &rreq->work)) +- netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq); +- } +-} +- +-/** +- * netfs_read_subreq_progress - Note progress of a read operation. +- * @subreq: The read request that has terminated. +- * @was_async: True if we're in an asynchronous context. 
+- * +- * This tells the read side of netfs lib that a contributory I/O operation has +- * made some progress and that it may be possible to unlock some folios. +- * +- * Before calling, the filesystem should update subreq->transferred to track +- * the amount of data copied into the output buffer. +- * +- * If @was_async is true, the caller might be running in softirq or interrupt +- * context and we can't sleep. +- */ +-void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, +- bool was_async) +-{ +- struct netfs_io_request *rreq = subreq->rreq; +- +- trace_netfs_sreq(subreq, netfs_sreq_trace_progress); +- +- if (subreq->transferred > subreq->consumed && +- (rreq->origin == NETFS_READAHEAD || +- rreq->origin == NETFS_READPAGE || +- rreq->origin == NETFS_READ_FOR_WRITE)) { +- netfs_consume_read_data(subreq, was_async); +- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); +- } +-} +-EXPORT_SYMBOL(netfs_read_subreq_progress); +- +-/** +- * netfs_read_subreq_terminated - Note the termination of an I/O operation. +- * @subreq: The I/O request that has terminated. +- * @error: Error code indicating type of completion. +- * @was_async: The termination was asynchronous +- * +- * This tells the read helper that a contributory I/O operation has terminated, +- * one way or another, and that it should integrate the results. +- * +- * The caller indicates the outcome of the operation through @error, supplying +- * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY +- * is set) or a negative error code. The helper will look after reissuing I/O +- * operations as appropriate and writing downloaded data to the cache. +- * +- * Before calling, the filesystem should update subreq->transferred to track +- * the amount of data copied into the output buffer. +- * +- * If @was_async is true, the caller might be running in softirq or interrupt +- * context and we can't sleep. 
+- */ +-void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, +- int error, bool was_async) +-{ +- struct netfs_io_request *rreq = subreq->rreq; +- +- switch (subreq->source) { +- case NETFS_READ_FROM_CACHE: +- netfs_stat(&netfs_n_rh_read_done); +- break; +- case NETFS_DOWNLOAD_FROM_SERVER: +- netfs_stat(&netfs_n_rh_download_done); +- break; +- default: +- break; +- } +- +- if (rreq->origin != NETFS_DIO_READ) { +- /* Collect buffered reads. +- * +- * If the read completed validly short, then we can clear the +- * tail before going on to unlock the folios. +- */ +- if (error == 0 && subreq->transferred < subreq->len && +- (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) || +- test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) { +- netfs_clear_unread(subreq); +- subreq->transferred = subreq->len; +- trace_netfs_sreq(subreq, netfs_sreq_trace_clear); +- } +- if (subreq->transferred > subreq->consumed && +- (rreq->origin == NETFS_READAHEAD || +- rreq->origin == NETFS_READPAGE || +- rreq->origin == NETFS_READ_FOR_WRITE)) { +- netfs_consume_read_data(subreq, was_async); +- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); +- } +- rreq->transferred += subreq->transferred; +- } +- +- /* Deal with retry requests, short reads and errors. If we retry +- * but don't make progress, we abandon the attempt. 
+- */ +- if (!error && subreq->transferred < subreq->len) { +- if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) { +- trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof); +- } else { +- trace_netfs_sreq(subreq, netfs_sreq_trace_short); +- if (subreq->transferred > subreq->consumed) { +- __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); +- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); +- set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); +- } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { +- __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); +- set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); +- } else { +- __set_bit(NETFS_SREQ_FAILED, &subreq->flags); +- error = -ENODATA; +- } +- } +- } +- +- subreq->error = error; +- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); +- +- if (unlikely(error < 0)) { +- trace_netfs_failure(rreq, subreq, error, netfs_fail_read); +- if (subreq->source == NETFS_READ_FROM_CACHE) { +- netfs_stat(&netfs_n_rh_read_failed); +- } else { +- netfs_stat(&netfs_n_rh_download_failed); +- set_bit(NETFS_RREQ_FAILED, &rreq->flags); +- rreq->error = subreq->error; +- } +- } +- +- if (atomic_dec_and_test(&rreq->nr_outstanding)) +- netfs_rreq_terminated(rreq, was_async); +- +- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +-} +-EXPORT_SYMBOL(netfs_read_subreq_terminated); +diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c +deleted file mode 100644 +index ba5af89d37fa..000000000000 +--- a/fs/netfs/read_pgpriv2.c ++++ /dev/null +@@ -1,264 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-only +-/* Read with PG_private_2 [DEPRECATED]. +- * +- * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. +- * Written by David Howells (dhowells@redhat.com) +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include "internal.h" +- +-/* +- * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2. 
The +- * third mark in the folio queue is used to indicate that this folio needs +- * writing. +- */ +-void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, +- struct netfs_io_request *rreq, +- struct folio_queue *folioq, +- int slot) +-{ +- struct folio *folio = folioq_folio(folioq, slot); +- +- trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); +- folio_start_private_2(folio); +- folioq_mark3(folioq, slot); +-} +- +-/* +- * [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an +- * unrecoverable error. +- */ +-static void netfs_pgpriv2_cancel(struct folio_queue *folioq) +-{ +- struct folio *folio; +- int slot; +- +- while (folioq) { +- if (!folioq->marks3) { +- folioq = folioq->next; +- continue; +- } +- +- slot = __ffs(folioq->marks3); +- folio = folioq_folio(folioq, slot); +- +- trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); +- folio_end_private_2(folio); +- folioq_unmark3(folioq, slot); +- } +-} +- +-/* +- * [DEPRECATED] Copy a folio to the cache with PG_private_2 set. +- */ +-static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio *folio) +-{ +- struct netfs_io_stream *cache = &wreq->io_streams[1]; +- size_t fsize = folio_size(folio), flen = fsize; +- loff_t fpos = folio_pos(folio), i_size; +- bool to_eof = false; +- +- _enter(""); +- +- /* netfs_perform_write() may shift i_size around the page or from out +- * of the page to beyond it, but cannot move i_size into or through the +- * page since we have it locked. +- */ +- i_size = i_size_read(wreq->inode); +- +- if (fpos >= i_size) { +- /* mmap beyond eof. 
*/ +- _debug("beyond eof"); +- folio_end_private_2(folio); +- return 0; +- } +- +- if (fpos + fsize > wreq->i_size) +- wreq->i_size = i_size; +- +- if (flen > i_size - fpos) { +- flen = i_size - fpos; +- to_eof = true; +- } else if (flen == i_size - fpos) { +- to_eof = true; +- } +- +- _debug("folio %zx %zx", flen, fsize); +- +- trace_netfs_folio(folio, netfs_folio_trace_store_copy); +- +- /* Attach the folio to the rolling buffer. */ +- if (netfs_buffer_append_folio(wreq, folio, false) < 0) +- return -ENOMEM; +- +- cache->submit_extendable_to = fsize; +- cache->submit_off = 0; +- cache->submit_len = flen; +- +- /* Attach the folio to one or more subrequests. For a big folio, we +- * could end up with thousands of subrequests if the wsize is small - +- * but we might need to wait during the creation of subrequests for +- * network resources (eg. SMB credits). +- */ +- do { +- ssize_t part; +- +- wreq->io_iter.iov_offset = cache->submit_off; +- +- atomic64_set(&wreq->issued_to, fpos + cache->submit_off); +- cache->submit_extendable_to = fsize - cache->submit_off; +- part = netfs_advance_write(wreq, cache, fpos + cache->submit_off, +- cache->submit_len, to_eof); +- cache->submit_off += part; +- if (part > cache->submit_len) +- cache->submit_len = 0; +- else +- cache->submit_len -= part; +- } while (cache->submit_len > 0); +- +- wreq->io_iter.iov_offset = 0; +- iov_iter_advance(&wreq->io_iter, fsize); +- atomic64_set(&wreq->issued_to, fpos + fsize); +- +- if (flen < fsize) +- netfs_issue_write(wreq, cache); +- +- _leave(" = 0"); +- return 0; +-} +- +-/* +- * [DEPRECATED] Go through the buffer and write any folios that are marked with +- * the third mark to the cache. 
+- */ +-void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) +-{ +- struct netfs_io_request *wreq; +- struct folio_queue *folioq; +- struct folio *folio; +- int error = 0; +- int slot = 0; +- +- _enter(""); +- +- if (!fscache_resources_valid(&rreq->cache_resources)) +- goto couldnt_start; +- +- /* Need the first folio to be able to set up the op. */ +- for (folioq = rreq->buffer; folioq; folioq = folioq->next) { +- if (folioq->marks3) { +- slot = __ffs(folioq->marks3); +- break; +- } +- } +- if (!folioq) +- return; +- folio = folioq_folio(folioq, slot); +- +- wreq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio), +- NETFS_PGPRIV2_COPY_TO_CACHE); +- if (IS_ERR(wreq)) { +- kleave(" [create %ld]", PTR_ERR(wreq)); +- goto couldnt_start; +- } +- +- trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); +- netfs_stat(&netfs_n_wh_copy_to_cache); +- +- for (;;) { +- error = netfs_pgpriv2_copy_folio(wreq, folio); +- if (error < 0) +- break; +- +- folioq_unmark3(folioq, slot); +- if (!folioq->marks3) { +- folioq = folioq->next; +- if (!folioq) +- break; +- } +- +- slot = __ffs(folioq->marks3); +- folio = folioq_folio(folioq, slot); +- } +- +- netfs_issue_write(wreq, &wreq->io_streams[1]); +- smp_wmb(); /* Write lists before ALL_QUEUED. */ +- set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); +- +- netfs_put_request(wreq, false, netfs_rreq_trace_put_return); +- _leave(" = %d", error); +-couldnt_start: +- netfs_pgpriv2_cancel(rreq->buffer); +-} +- +-/* +- * [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished +- * copying. 
+- */ +-bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) +-{ +- struct folio_queue *folioq = wreq->buffer; +- unsigned long long collected_to = wreq->collected_to; +- unsigned int slot = wreq->buffer_head_slot; +- bool made_progress = false; +- +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = netfs_delete_buffer_head(wreq); +- slot = 0; +- } +- +- for (;;) { +- struct folio *folio; +- unsigned long long fpos, fend; +- size_t fsize, flen; +- +- folio = folioq_folio(folioq, slot); +- if (WARN_ONCE(!folio_test_private_2(folio), +- "R=%08x: folio %lx is not marked private_2\n", +- wreq->debug_id, folio->index)) +- trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); +- +- fpos = folio_pos(folio); +- fsize = folio_size(folio); +- flen = fsize; +- +- fend = min_t(unsigned long long, fpos + flen, wreq->i_size); +- +- trace_netfs_collect_folio(wreq, folio, fend, collected_to); +- +- /* Unlock any folio we've transferred all of. */ +- if (collected_to < fend) +- break; +- +- trace_netfs_folio(folio, netfs_folio_trace_end_copy); +- folio_end_private_2(folio); +- wreq->cleaned_to = fpos + fsize; +- made_progress = true; +- +- /* Clean up the head folioq. If we clear an entire folioq, then +- * we can get rid of it provided it's not also the tail folioq +- * being filled by the issuer. +- */ +- folioq_clear(folioq, slot); +- slot++; +- if (slot >= folioq_nr_slots(folioq)) { +- if (READ_ONCE(wreq->buffer_tail) == folioq) +- break; +- folioq = netfs_delete_buffer_head(wreq); +- slot = 0; +- } +- +- if (fpos + fsize >= collected_to) +- break; +- } +- +- wreq->buffer = folioq; +- wreq->buffer_head_slot = slot; +- return made_progress; +-} +diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c +deleted file mode 100644 +index 0350592ea804..000000000000 +--- a/fs/netfs/read_retry.c ++++ /dev/null +@@ -1,256 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-only +-/* Network filesystem read subrequest retrying. 
+- * +- * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. +- * Written by David Howells (dhowells@redhat.com) +- */ +- +-#include +-#include +-#include "internal.h" +- +-static void netfs_reissue_read(struct netfs_io_request *rreq, +- struct netfs_io_subrequest *subreq) +-{ +- struct iov_iter *io_iter = &subreq->io_iter; +- +- if (iov_iter_is_folioq(io_iter)) { +- subreq->curr_folioq = (struct folio_queue *)io_iter->folioq; +- subreq->curr_folioq_slot = io_iter->folioq_slot; +- subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; +- } +- +- atomic_inc(&rreq->nr_outstanding); +- __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); +- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); +- subreq->rreq->netfs_ops->issue_read(subreq); +-} +- +-/* +- * Go through the list of failed/short reads, retrying all retryable ones. We +- * need to switch failed cache reads to network downloads. +- */ +-static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) +-{ +- struct netfs_io_subrequest *subreq; +- struct netfs_io_stream *stream0 = &rreq->io_streams[0]; +- LIST_HEAD(sublist); +- LIST_HEAD(queue); +- +- _enter("R=%x", rreq->debug_id); +- +- if (list_empty(&rreq->subrequests)) +- return; +- +- if (rreq->netfs_ops->retry_request) +- rreq->netfs_ops->retry_request(rreq, NULL); +- +- /* If there's no renegotiation to do, just resend each retryable subreq +- * up to the first permanently failed one. 
+- */ +- if (!rreq->netfs_ops->prepare_read && +- !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { +- struct netfs_io_subrequest *subreq; +- +- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { +- if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) +- break; +- if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { +- netfs_reset_iter(subreq); +- netfs_reissue_read(rreq, subreq); +- } +- } +- return; +- } +- +- /* Okay, we need to renegotiate all the download requests and flip any +- * failed cache reads over to being download requests and negotiate +- * those also. All fully successful subreqs have been removed from the +- * list and any spare data from those has been donated. +- * +- * What we do is decant the list and rebuild it one subreq at a time so +- * that we don't end up with donations jumping over a gap we're busy +- * populating with smaller subrequests. In the event that the subreq +- * we just launched finishes before we insert the next subreq, it'll +- * fill in rreq->prev_donated instead. +- +- * Note: Alternatively, we could split the tail subrequest right before +- * we reissue it and fix up the donations under lock. +- */ +- list_splice_init(&rreq->subrequests, &queue); +- +- do { +- struct netfs_io_subrequest *from; +- struct iov_iter source; +- unsigned long long start, len; +- size_t part, deferred_next_donated = 0; +- bool boundary = false; +- +- /* Go through the subreqs and find the next span of contiguous +- * buffer that we then rejig (cifs, for example, needs the +- * rsize renegotiating) and reissue. 
+- */ +- from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link); +- list_move_tail(&from->rreq_link, &sublist); +- start = from->start + from->transferred; +- len = from->len - from->transferred; +- +- _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx", +- rreq->debug_id, from->debug_index, +- from->start, from->consumed, from->transferred, from->len); +- +- if (test_bit(NETFS_SREQ_FAILED, &from->flags) || +- !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) +- goto abandon; +- +- deferred_next_donated = from->next_donated; +- while ((subreq = list_first_entry_or_null( +- &queue, struct netfs_io_subrequest, rreq_link))) { +- if (subreq->start != start + len || +- subreq->transferred > 0 || +- !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) +- break; +- list_move_tail(&subreq->rreq_link, &sublist); +- len += subreq->len; +- deferred_next_donated = subreq->next_donated; +- if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags)) +- break; +- } +- +- _debug(" - range: %llx-%llx %llx", start, start + len - 1, len); +- +- /* Determine the set of buffers we're going to use. Each +- * subreq gets a subset of a single overall contiguous buffer. +- */ +- netfs_reset_iter(from); +- source = from->io_iter; +- source.count = len; +- +- /* Work through the sublist. 
*/ +- while ((subreq = list_first_entry_or_null( +- &sublist, struct netfs_io_subrequest, rreq_link))) { +- list_del(&subreq->rreq_link); +- +- subreq->source = NETFS_DOWNLOAD_FROM_SERVER; +- subreq->start = start - subreq->transferred; +- subreq->len = len + subreq->transferred; +- stream0->sreq_max_len = subreq->len; +- +- __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); +- __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); +- +- spin_lock_bh(&rreq->lock); +- list_add_tail(&subreq->rreq_link, &rreq->subrequests); +- subreq->prev_donated += rreq->prev_donated; +- rreq->prev_donated = 0; +- trace_netfs_sreq(subreq, netfs_sreq_trace_retry); +- spin_unlock_bh(&rreq->lock); +- +- BUG_ON(!len); +- +- /* Renegotiate max_len (rsize) */ +- if (rreq->netfs_ops->prepare_read(subreq) < 0) { +- trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); +- __set_bit(NETFS_SREQ_FAILED, &subreq->flags); +- } +- +- part = umin(len, stream0->sreq_max_len); +- if (unlikely(rreq->io_streams[0].sreq_max_segs)) +- part = netfs_limit_iter(&source, 0, part, stream0->sreq_max_segs); +- subreq->len = subreq->transferred + part; +- subreq->io_iter = source; +- iov_iter_truncate(&subreq->io_iter, part); +- iov_iter_advance(&source, part); +- len -= part; +- start += part; +- if (!len) { +- if (boundary) +- __set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); +- subreq->next_donated = deferred_next_donated; +- } else { +- __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); +- subreq->next_donated = 0; +- } +- +- netfs_reissue_read(rreq, subreq); +- if (!len) +- break; +- +- /* If we ran out of subrequests, allocate another. */ +- if (list_empty(&sublist)) { +- subreq = netfs_alloc_subrequest(rreq); +- if (!subreq) +- goto abandon; +- subreq->source = NETFS_DOWNLOAD_FROM_SERVER; +- subreq->start = start; +- +- /* We get two refs, but need just one. 
*/ +- netfs_put_subrequest(subreq, false, netfs_sreq_trace_new); +- trace_netfs_sreq(subreq, netfs_sreq_trace_split); +- list_add_tail(&subreq->rreq_link, &sublist); +- } +- } +- +- /* If we managed to use fewer subreqs, we can discard the +- * excess. +- */ +- while ((subreq = list_first_entry_or_null( +- &sublist, struct netfs_io_subrequest, rreq_link))) { +- trace_netfs_sreq(subreq, netfs_sreq_trace_discard); +- list_del(&subreq->rreq_link); +- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); +- } +- +- } while (!list_empty(&queue)); +- +- return; +- +- /* If we hit ENOMEM, fail all remaining subrequests */ +-abandon: +- list_splice_init(&sublist, &queue); +- list_for_each_entry(subreq, &queue, rreq_link) { +- if (!subreq->error) +- subreq->error = -ENOMEM; +- __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); +- __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); +- __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); +- } +- spin_lock_bh(&rreq->lock); +- list_splice_tail_init(&queue, &rreq->subrequests); +- spin_unlock_bh(&rreq->lock); +-} +- +-/* +- * Retry reads. +- */ +-void netfs_retry_reads(struct netfs_io_request *rreq) +-{ +- trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); +- +- atomic_inc(&rreq->nr_outstanding); +- +- netfs_retry_read_subrequests(rreq); +- +- if (atomic_dec_and_test(&rreq->nr_outstanding)) +- netfs_rreq_terminated(rreq, false); +-} +- +-/* +- * Unlock any the pages that haven't been unlocked yet due to abandoned +- * subrequests. 
+- */ +-void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq) +-{ +- struct folio_queue *p; +- +- for (p = rreq->buffer; p; p = p->next) { +- for (int slot = 0; slot < folioq_count(p); slot++) { +- struct folio *folio = folioq_folio(p, slot); +- +- if (folio && !folioq_is_marked2(p, slot)) { +- trace_netfs_folio(folio, netfs_folio_trace_abandon); +- folio_unlock(folio); +- } +- } +- } +-} +diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c +index 8e63516b40f6..0892768eea32 100644 +--- a/fs/netfs/stats.c ++++ b/fs/netfs/stats.c +@@ -32,7 +32,6 @@ atomic_t netfs_n_wh_buffered_write; + atomic_t netfs_n_wh_writethrough; + atomic_t netfs_n_wh_dio_write; + atomic_t netfs_n_wh_writepages; +-atomic_t netfs_n_wh_copy_to_cache; + atomic_t netfs_n_wh_wstream_conflict; + atomic_t netfs_n_wh_upload; + atomic_t netfs_n_wh_upload_done; +@@ -40,53 +39,45 @@ atomic_t netfs_n_wh_upload_failed; + atomic_t netfs_n_wh_write; + atomic_t netfs_n_wh_write_done; + atomic_t netfs_n_wh_write_failed; +-atomic_t netfs_n_wb_lock_skip; +-atomic_t netfs_n_wb_lock_wait; +-atomic_t netfs_n_folioq; + + int netfs_stats_show(struct seq_file *m, void *v) + { +- seq_printf(m, "Reads : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", ++ seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", + atomic_read(&netfs_n_rh_dio_read), + atomic_read(&netfs_n_rh_readahead), + atomic_read(&netfs_n_rh_read_folio), + atomic_read(&netfs_n_rh_write_begin), + atomic_read(&netfs_n_rh_write_zskip)); +- seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n", ++ seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n", + atomic_read(&netfs_n_wh_buffered_write), + atomic_read(&netfs_n_wh_writethrough), + atomic_read(&netfs_n_wh_dio_write), +- atomic_read(&netfs_n_wh_writepages), +- atomic_read(&netfs_n_wh_copy_to_cache)); +- seq_printf(m, "ZeroOps: ZR=%u sh=%u sk=%u\n", ++ atomic_read(&netfs_n_wh_writepages)); ++ seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", + atomic_read(&netfs_n_rh_zero), + 
atomic_read(&netfs_n_rh_short_read), + atomic_read(&netfs_n_rh_write_zskip)); +- seq_printf(m, "DownOps: DL=%u ds=%u df=%u di=%u\n", ++ seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n", + atomic_read(&netfs_n_rh_download), + atomic_read(&netfs_n_rh_download_done), + atomic_read(&netfs_n_rh_download_failed), + atomic_read(&netfs_n_rh_download_instead)); +- seq_printf(m, "CaRdOps: RD=%u rs=%u rf=%u\n", ++ seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n", + atomic_read(&netfs_n_rh_read), + atomic_read(&netfs_n_rh_read_done), + atomic_read(&netfs_n_rh_read_failed)); +- seq_printf(m, "UpldOps: UL=%u us=%u uf=%u\n", ++ seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n", + atomic_read(&netfs_n_wh_upload), + atomic_read(&netfs_n_wh_upload_done), + atomic_read(&netfs_n_wh_upload_failed)); +- seq_printf(m, "CaWrOps: WR=%u ws=%u wf=%u\n", ++ seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n", + atomic_read(&netfs_n_wh_write), + atomic_read(&netfs_n_wh_write_done), + atomic_read(&netfs_n_wh_write_failed)); +- seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", ++ seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n", + atomic_read(&netfs_n_rh_rreq), + atomic_read(&netfs_n_rh_sreq), +- atomic_read(&netfs_n_folioq), + atomic_read(&netfs_n_wh_wstream_conflict)); +- seq_printf(m, "WbLock : skip=%u wait=%u\n", +- atomic_read(&netfs_n_wb_lock_skip), +- atomic_read(&netfs_n_wb_lock_wait)); + return fscache_stats_show(m); + } + EXPORT_SYMBOL(netfs_stats_show); +diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c +index 1d438be2e1b4..ae7a2043f670 100644 +--- a/fs/netfs/write_collect.c ++++ b/fs/netfs/write_collect.c +@@ -15,11 +15,15 @@ + + /* Notes made in the collector */ + #define HIT_PENDING 0x01 /* A front op was still pending */ +-#define NEED_REASSESS 0x02 /* Need to loop round and reassess */ +-#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ +-#define BUFFERED 0x08 /* The pagecache needs cleaning up */ +-#define NEED_RETRY 0x10 /* A front op requests 
retrying */ +-#define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */ ++#define SOME_EMPTY 0x02 /* One of more streams are empty */ ++#define ALL_EMPTY 0x04 /* All streams are empty */ ++#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */ ++#define NEED_REASSESS 0x10 /* Need to loop round and reassess */ ++#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */ ++#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */ ++#define BUFFERED 0x80 /* The pagecache needs cleaning up */ ++#define NEED_RETRY 0x100 /* A front op requests retrying */ ++#define SAW_FAILURE 0x200 /* One stream or hit a permanent failure */ + + /* + * Successful completion of write of a folio to the server and/or cache. Note +@@ -78,37 +82,55 @@ int netfs_folio_written_back(struct folio *folio) + } + + /* +- * Unlock any folios we've finished with. ++ * Get hold of a folio we have under writeback. We don't want to get the ++ * refcount on it. 
+ */ +-static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, +- unsigned int *notes) ++static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos) + { +- struct folio_queue *folioq = wreq->buffer; +- unsigned long long collected_to = wreq->collected_to; +- unsigned int slot = wreq->buffer_head_slot; ++ XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE); ++ struct folio *folio; + +- if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) { +- if (netfs_pgpriv2_unlock_copied_folios(wreq)) +- *notes |= MADE_PROGRESS; +- return; ++ rcu_read_lock(); ++ ++ for (;;) { ++ xas_reset(&xas); ++ folio = xas_load(&xas); ++ if (xas_retry(&xas, folio)) ++ continue; ++ ++ if (!folio || xa_is_value(folio)) ++ kdebug("R=%08x: folio %lx (%llx) not present", ++ wreq->debug_id, xas.xa_index, pos / PAGE_SIZE); ++ BUG_ON(!folio || xa_is_value(folio)); ++ ++ if (folio == xas_reload(&xas)) ++ break; + } + +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = netfs_delete_buffer_head(wreq); +- slot = 0; ++ rcu_read_unlock(); ++ ++ if (WARN_ONCE(!folio_test_writeback(folio), ++ "R=%08x: folio %lx is not under writeback\n", ++ wreq->debug_id, folio->index)) { ++ trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); + } ++ return folio; ++} + ++/* ++ * Unlock any folios we've finished with. 
++ */ ++static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, ++ unsigned long long collected_to, ++ unsigned int *notes) ++{ + for (;;) { + struct folio *folio; + struct netfs_folio *finfo; + unsigned long long fpos, fend; + size_t fsize, flen; + +- folio = folioq_folio(folioq, slot); +- if (WARN_ONCE(!folio_test_writeback(folio), +- "R=%08x: folio %lx is not under writeback\n", +- wreq->debug_id, folio->index)) +- trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); ++ folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to); + + fpos = folio_pos(folio); + fsize = folio_size(folio); +@@ -119,6 +141,12 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, + + trace_netfs_collect_folio(wreq, folio, fend, collected_to); + ++ if (fpos + fsize > wreq->contiguity) { ++ trace_netfs_collect_contig(wreq, fpos + fsize, ++ netfs_contig_trace_unlock); ++ wreq->contiguity = fpos + fsize; ++ } ++ + /* Unlock any folio we've transferred all of. */ + if (collected_to < fend) + break; +@@ -127,25 +155,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, + wreq->cleaned_to = fpos + fsize; + *notes |= MADE_PROGRESS; + +- /* Clean up the head folioq. If we clear an entire folioq, then +- * we can get rid of it provided it's not also the tail folioq +- * being filled by the issuer. 
+- */ +- folioq_clear(folioq, slot); +- slot++; +- if (slot >= folioq_nr_slots(folioq)) { +- if (READ_ONCE(wreq->buffer_tail) == folioq) +- break; +- folioq = netfs_delete_buffer_head(wreq); +- slot = 0; +- } +- + if (fpos + fsize >= collected_to) + break; + } +- +- wreq->buffer = folioq; +- wreq->buffer_head_slot = slot; + } + + /* +@@ -176,12 +188,9 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + break; + if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { +- struct iov_iter source = subreq->io_iter; +- +- iov_iter_revert(&source, subreq->len - source.count); + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); +- netfs_reissue_write(stream, subreq, &source); ++ netfs_reissue_write(stream, subreq); + } + } + return; +@@ -191,7 +200,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + + do { + struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; +- struct iov_iter source; + unsigned long long start, len; + size_t part; + bool boundary = false; +@@ -219,13 +227,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + len += to->len; + } + +- /* Determine the set of buffers we're going to use. Each +- * subreq gets a subset of a single overall contiguous buffer. +- */ +- netfs_reset_iter(from); +- source = from->io_iter; +- source.count = len; +- + /* Work through the sublist. 
*/ + subreq = from; + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { +@@ -237,7 +238,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + stream->prepare_write(subreq); + +- part = min(len, stream->sreq_max_len); ++ part = min(len, subreq->max_len); + subreq->len = part; + subreq->start = start; + subreq->transferred = 0; +@@ -248,7 +249,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + boundary = true; + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); +- netfs_reissue_write(stream, subreq, &source); ++ netfs_reissue_write(stream, subreq); + if (subreq == to) + break; + } +@@ -277,6 +278,8 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + subreq = netfs_alloc_subrequest(wreq); + subreq->source = to->source; + subreq->start = start; ++ subreq->max_len = len; ++ subreq->max_nr_segs = INT_MAX; + subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); + subreq->stream_nr = to->stream_nr; + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); +@@ -290,12 +293,10 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + to = list_next_entry(to, rreq_link); + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + +- stream->sreq_max_len = len; +- stream->sreq_max_segs = INT_MAX; + switch (stream->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); +- stream->sreq_max_len = umin(len, wreq->wsize); ++ subreq->max_len = min(len, wreq->wsize); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); +@@ -306,7 +307,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + + stream->prepare_write(subreq); + +- part = umin(len, stream->sreq_max_len); ++ part = min(len, subreq->max_len); + subreq->len = subreq->transferred + part; + len -= part; + start += part; +@@ -315,7 +316,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, + 
boundary = false; + } + +- netfs_reissue_write(stream, subreq, &source); ++ netfs_reissue_write(stream, subreq); + if (!len) + break; + +@@ -376,7 +377,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) + { + struct netfs_io_subrequest *front, *remove; + struct netfs_io_stream *stream; +- unsigned long long collected_to, issued_to; ++ unsigned long long collected_to; + unsigned int notes; + int s; + +@@ -385,22 +386,28 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) + trace_netfs_rreq(wreq, netfs_rreq_trace_collect); + + reassess_streams: +- issued_to = atomic64_read(&wreq->issued_to); + smp_rmb(); + collected_to = ULLONG_MAX; +- if (wreq->origin == NETFS_WRITEBACK || +- wreq->origin == NETFS_WRITETHROUGH || +- wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) +- notes = BUFFERED; ++ if (wreq->origin == NETFS_WRITEBACK) ++ notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG; ++ else if (wreq->origin == NETFS_WRITETHROUGH) ++ notes = ALL_EMPTY | BUFFERED; + else +- notes = 0; ++ notes = ALL_EMPTY; + + /* Remove completed subrequests from the front of the streams and + * advance the completion point on each stream. We stop when we hit + * something that's in progress. The issuer thread may be adding stuff + * to the tail whilst we're doing this. ++ * ++ * We must not, however, merge in discontiguities that span whole ++ * folios that aren't under writeback. This is made more complicated ++ * by the folios in the gap being of unpredictable sizes - if they even ++ * exist - but we don't want to look them up. 
+ */ + for (s = 0; s < NR_IO_STREAMS; s++) { ++ loff_t rstart, rend; ++ + stream = &wreq->io_streams[s]; + /* Read active flag before list pointers */ + if (!smp_load_acquire(&stream->active)) +@@ -412,10 +419,26 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) + //_debug("sreq [%x] %llx %zx/%zx", + // front->debug_index, front->start, front->transferred, front->len); + +- if (stream->collected_to < front->start) { +- trace_netfs_collect_gap(wreq, stream, issued_to, 'F'); +- stream->collected_to = front->start; ++ /* Stall if there may be a discontinuity. */ ++ rstart = round_down(front->start, PAGE_SIZE); ++ if (rstart > wreq->contiguity) { ++ if (wreq->contiguity > stream->collected_to) { ++ trace_netfs_collect_gap(wreq, stream, ++ wreq->contiguity, 'D'); ++ stream->collected_to = wreq->contiguity; ++ } ++ notes |= REASSESS_DISCONTIG; ++ break; ++ } ++ rend = round_up(front->start + front->len, PAGE_SIZE); ++ if (rend > wreq->contiguity) { ++ trace_netfs_collect_contig(wreq, rend, ++ netfs_contig_trace_collect); ++ wreq->contiguity = rend; ++ if (notes & REASSESS_DISCONTIG) ++ notes |= NEED_REASSESS; + } ++ notes &= ~MAYBE_DISCONTIG; + + /* Stall if the front is still undergoing I/O. */ + if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) { +@@ -450,27 +473,33 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) + + cancel: + /* Remove if completely consumed. 
*/ +- spin_lock_bh(&wreq->lock); ++ spin_lock(&wreq->lock); + + remove = front; + list_del_init(&front->rreq_link); + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + stream->front = front; +- spin_unlock_bh(&wreq->lock); ++ if (!front) { ++ unsigned long long jump_to = atomic64_read(&wreq->issued_to); ++ ++ if (stream->collected_to < jump_to) { ++ trace_netfs_collect_gap(wreq, stream, jump_to, 'A'); ++ stream->collected_to = jump_to; ++ } ++ } ++ ++ spin_unlock(&wreq->lock); + netfs_put_subrequest(remove, false, + notes & SAW_FAILURE ? + netfs_sreq_trace_put_cancel : + netfs_sreq_trace_put_done); + } + +- /* If we have an empty stream, we need to jump it forward +- * otherwise the collection point will never advance. +- */ +- if (!front && issued_to > stream->collected_to) { +- trace_netfs_collect_gap(wreq, stream, issued_to, 'E'); +- stream->collected_to = issued_to; +- } ++ if (front) ++ notes &= ~ALL_EMPTY; ++ else ++ notes |= SOME_EMPTY; + + if (stream->collected_to < collected_to) + collected_to = stream->collected_to; +@@ -479,6 +508,36 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) + if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to) + wreq->collected_to = collected_to; + ++ /* If we have an empty stream, we need to jump it forward over any gap ++ * otherwise the collection point will never advance. ++ * ++ * Note that the issuer always adds to the stream with the lowest ++ * so-far submitted start, so if we see two consecutive subreqs in one ++ * stream with nothing between then in another stream, then the second ++ * stream has a gap that can be jumped. 
++ */ ++ if (notes & SOME_EMPTY) { ++ unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted); ++ ++ for (s = 0; s < NR_IO_STREAMS; s++) { ++ stream = &wreq->io_streams[s]; ++ if (stream->active && ++ stream->front && ++ stream->front->start < jump_to) ++ jump_to = stream->front->start; ++ } ++ ++ for (s = 0; s < NR_IO_STREAMS; s++) { ++ stream = &wreq->io_streams[s]; ++ if (stream->active && ++ !stream->front && ++ stream->collected_to < jump_to) { ++ trace_netfs_collect_gap(wreq, stream, jump_to, 'B'); ++ stream->collected_to = jump_to; ++ } ++ } ++ } ++ + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active) +@@ -489,14 +548,43 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) + + /* Unlock any folios that we have now finished with. */ + if (notes & BUFFERED) { +- if (wreq->cleaned_to < wreq->collected_to) +- netfs_writeback_unlock_folios(wreq, ¬es); ++ unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity); ++ ++ if (wreq->cleaned_to < clean_to) ++ netfs_writeback_unlock_folios(wreq, clean_to, ¬es); + } else { + wreq->cleaned_to = wreq->collected_to; + } + + // TODO: Discard encryption buffers + ++ /* If all streams are discontiguous with the last folio we cleared, we ++ * may need to skip a set of folios. 
++ */ ++ if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) { ++ unsigned long long jump_to = ULLONG_MAX; ++ ++ for (s = 0; s < NR_IO_STREAMS; s++) { ++ stream = &wreq->io_streams[s]; ++ if (stream->active && stream->front && ++ stream->front->start < jump_to) ++ jump_to = stream->front->start; ++ } ++ ++ trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump); ++ wreq->contiguity = jump_to; ++ wreq->cleaned_to = jump_to; ++ wreq->collected_to = jump_to; ++ for (s = 0; s < NR_IO_STREAMS; s++) { ++ stream = &wreq->io_streams[s]; ++ if (stream->collected_to < jump_to) ++ stream->collected_to = jump_to; ++ } ++ //cond_resched(); ++ notes |= MADE_PROGRESS; ++ goto reassess_streams; ++ } ++ + if (notes & NEED_RETRY) + goto need_retry; + if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { +diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c +index 04e66d587f77..3f7e37e50c7d 100644 +--- a/fs/netfs/write_issue.c ++++ b/fs/netfs/write_issue.c +@@ -95,8 +95,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, + struct netfs_io_request *wreq; + struct netfs_inode *ictx; + bool is_buffered = (origin == NETFS_WRITEBACK || +- origin == NETFS_WRITETHROUGH || +- origin == NETFS_PGPRIV2_COPY_TO_CACHE); ++ origin == NETFS_WRITETHROUGH); + + wreq = netfs_alloc_request(mapping, file, start, 0, origin); + if (IS_ERR(wreq)) +@@ -108,7 +107,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, + if (is_buffered && netfs_is_cache_enabled(ictx)) + fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); + ++ wreq->contiguity = wreq->start; + wreq->cleaned_to = wreq->start; ++ INIT_WORK(&wreq->work, netfs_write_collection_worker); + + wreq->io_streams[0].stream_nr = 0; + wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER; +@@ -157,19 +158,22 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, + subreq = netfs_alloc_subrequest(wreq); + 
subreq->source = stream->source; + subreq->start = start; ++ subreq->max_len = ULONG_MAX; ++ subreq->max_nr_segs = INT_MAX; + subreq->stream_nr = stream->stream_nr; +- subreq->io_iter = wreq->io_iter; + + _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); + ++ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, ++ refcount_read(&subreq->ref), ++ netfs_sreq_trace_new); ++ + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + +- stream->sreq_max_len = UINT_MAX; +- stream->sreq_max_segs = INT_MAX; + switch (stream->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); +- stream->sreq_max_len = wreq->wsize; ++ subreq->max_len = wreq->wsize; + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); +@@ -188,7 +192,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, + * the list. The collector only goes nextwards and uses the lock to + * remove entries off of the front. + */ +- spin_lock_bh(&wreq->lock); ++ spin_lock(&wreq->lock); + list_add_tail(&subreq->rreq_link, &stream->subrequests); + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { + stream->front = subreq; +@@ -199,7 +203,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, + } + } + +- spin_unlock_bh(&wreq->lock); ++ spin_unlock(&wreq->lock); + + stream->construct = subreq; + } +@@ -219,34 +223,41 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + return netfs_write_subrequest_terminated(subreq, subreq->error, false); + ++ // TODO: Use encrypted buffer ++ if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) { ++ subreq->io_iter = wreq->io_iter; ++ iov_iter_advance(&subreq->io_iter, ++ subreq->start + subreq->transferred - wreq->start); ++ iov_iter_truncate(&subreq->io_iter, ++ subreq->len - subreq->transferred); ++ } else { ++ iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages, ++ subreq->start + subreq->transferred, ++ subreq->len - 
subreq->transferred); ++ } ++ + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + stream->issue_write(subreq); + } + + void netfs_reissue_write(struct netfs_io_stream *stream, +- struct netfs_io_subrequest *subreq, +- struct iov_iter *source) ++ struct netfs_io_subrequest *subreq) + { +- size_t size = subreq->len - subreq->transferred; +- +- // TODO: Use encrypted buffer +- subreq->io_iter = *source; +- iov_iter_advance(source, size); +- iov_iter_truncate(&subreq->io_iter, size); +- + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_do_issue_write(stream, subreq); + } + +-void netfs_issue_write(struct netfs_io_request *wreq, +- struct netfs_io_stream *stream) ++static void netfs_issue_write(struct netfs_io_request *wreq, ++ struct netfs_io_stream *stream) + { + struct netfs_io_subrequest *subreq = stream->construct; + + if (!subreq) + return; + stream->construct = NULL; +- subreq->io_iter.count = subreq->len; ++ ++ if (subreq->start + subreq->len > wreq->start + wreq->submitted) ++ WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start); + netfs_do_issue_write(stream, subreq); + } + +@@ -279,14 +290,13 @@ int netfs_advance_write(struct netfs_io_request *wreq, + netfs_prepare_write(wreq, stream, start); + subreq = stream->construct; + +- part = umin(stream->sreq_max_len - subreq->len, len); +- _debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len); ++ part = min(subreq->max_len - subreq->len, len); ++ _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); + subreq->len += part; + subreq->nr_segs++; +- stream->submit_extendable_to -= part; + +- if (subreq->len >= stream->sreq_max_len || +- subreq->nr_segs >= stream->sreq_max_segs || ++ if (subreq->len >= subreq->max_len || ++ subreq->nr_segs >= subreq->max_nr_segs || + to_eof) { + netfs_issue_write(wreq, stream); + subreq = NULL; +@@ -400,26 +410,19 @@ static int netfs_write_folio(struct netfs_io_request *wreq, + folio_unlock(folio); + + if (fgroup 
== NETFS_FOLIO_COPY_TO_CACHE) { +- if (!cache->avail) { ++ if (!fscache_resources_valid(&wreq->cache_resources)) { + trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); + netfs_issue_write(wreq, upload); + netfs_folio_written_back(folio); + return 0; + } + trace_netfs_folio(folio, netfs_folio_trace_store_copy); +- } else if (!upload->avail && !cache->avail) { +- trace_netfs_folio(folio, netfs_folio_trace_cancel_store); +- netfs_folio_written_back(folio); +- return 0; + } else if (!upload->construct) { + trace_netfs_folio(folio, netfs_folio_trace_store); + } else { + trace_netfs_folio(folio, netfs_folio_trace_store_plus); + } + +- /* Attach the folio to the rolling buffer. */ +- netfs_buffer_append_folio(wreq, folio, false); +- + /* Move the submission point forward to allow for write-streaming data + * not starting at the front of the page. We don't do write-streaming + * with the cache as the cache requires DIO alignment. +@@ -429,6 +432,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, + */ + for (int s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; ++ stream->submit_max_len = fsize; + stream->submit_off = foff; + stream->submit_len = flen; + if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) || +@@ -436,6 +440,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, + fgroup == NETFS_FOLIO_COPY_TO_CACHE)) { + stream->submit_off = UINT_MAX; + stream->submit_len = 0; ++ stream->submit_max_len = 0; + } + } + +@@ -462,13 +467,12 @@ static int netfs_write_folio(struct netfs_io_request *wreq, + if (choose_s < 0) + break; + stream = &wreq->io_streams[choose_s]; +- wreq->io_iter.iov_offset = stream->submit_off; + +- atomic64_set(&wreq->issued_to, fpos + stream->submit_off); +- stream->submit_extendable_to = fsize - stream->submit_off; + part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, + stream->submit_len, to_eof); ++ atomic64_set(&wreq->issued_to, fpos + stream->submit_off); + stream->submit_off += 
part; ++ stream->submit_max_len -= part; + if (part > stream->submit_len) + stream->submit_len = 0; + else +@@ -477,8 +481,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq, + debug = true; + } + +- wreq->io_iter.iov_offset = 0; +- iov_iter_advance(&wreq->io_iter, fsize); + atomic64_set(&wreq->issued_to, fpos + fsize); + + if (!debug) +@@ -503,14 +505,10 @@ int netfs_writepages(struct address_space *mapping, + struct folio *folio; + int error = 0; + +- if (!mutex_trylock(&ictx->wb_lock)) { +- if (wbc->sync_mode == WB_SYNC_NONE) { +- netfs_stat(&netfs_n_wb_lock_skip); +- return 0; +- } +- netfs_stat(&netfs_n_wb_lock_wait); ++ if (wbc->sync_mode == WB_SYNC_ALL) + mutex_lock(&ictx->wb_lock); +- } ++ else if (!mutex_trylock(&ictx->wb_lock)) ++ return 0; + + /* Need the first folio to be able to set up the op. */ + folio = writeback_iter(mapping, wbc, NULL, &error); +@@ -527,10 +525,10 @@ int netfs_writepages(struct address_space *mapping, + netfs_stat(&netfs_n_wh_writepages); + + do { +- _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to)); ++ _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); + + /* It appears we don't have to handle cyclic writeback wrapping. 
*/ +- WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to)); ++ WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted); + + if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE && + unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) { +@@ -674,7 +672,6 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t + part = netfs_advance_write(wreq, upload, start, len, false); + start += part; + len -= part; +- iov_iter_advance(&wreq->io_iter, part); + if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { + trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); + wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE); +diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c +index 810269ee0a50..7a558dea75c4 100644 +--- a/fs/nfs/fscache.c ++++ b/fs/nfs/fscache.c +@@ -267,7 +267,6 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi + rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. 
*/ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); +- rreq->io_streams[0].sreq_max_len = NFS_SB(rreq->inode->i_sb)->rsize; + + return 0; + } +@@ -289,6 +288,14 @@ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sre + return netfs; + } + ++static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq) ++{ ++ size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize; ++ ++ sreq->len = min(sreq->len, rsize); ++ return true; ++} ++ + static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) + { + struct nfs_netfs_io_data *netfs; +@@ -297,18 +304,17 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) + struct nfs_open_context *ctx = sreq->rreq->netfs_priv; + struct page *page; + unsigned long idx; +- pgoff_t start, last; + int err; +- +- start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; +- last = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT); ++ pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; ++ pgoff_t last = ((sreq->start + sreq->len - ++ sreq->transferred - 1) >> PAGE_SHIFT); + + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + + netfs = nfs_netfs_alloc(sreq); + if (!netfs) +- return netfs_read_subreq_terminated(sreq, -ENOMEM, false); ++ return netfs_subreq_terminated(sreq, -ENOMEM, false); + + pgio.pg_netfs = netfs; /* used in completion */ + +@@ -374,4 +380,5 @@ const struct netfs_request_ops nfs_netfs_ops = { + .init_request = nfs_netfs_init_request, + .free_request = nfs_netfs_free_request, + .issue_read = nfs_netfs_issue_read, ++ .clamp_length = nfs_netfs_clamp_length + }; +diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h +index 772d485e96d3..e8adae1bc260 100644 +--- a/fs/nfs/fscache.h ++++ b/fs/nfs/fscache.h +@@ -60,6 +60,8 @@ static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs) + + static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) + { ++ ssize_t final_len; ++ + /* Only the last RPC completion should 
call netfs_subreq_terminated() */ + if (!refcount_dec_and_test(&netfs->refcount)) + return; +@@ -72,9 +74,8 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) + * Correct the final length here to be no larger than the netfs subrequest + * length, and thus avoid netfs's "Subreq overread" warning message. + */ +- netfs->sreq->transferred = min_t(s64, netfs->sreq->len, +- atomic64_read(&netfs->transferred)); +- netfs_read_subreq_terminated(netfs->sreq, netfs->error, false); ++ final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); ++ netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false); + kfree(netfs); + } + static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) +diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c +index 7481b21a0489..6322f0f68a17 100644 +--- a/fs/smb/client/cifsencrypt.c ++++ b/fs/smb/client/cifsencrypt.c +@@ -21,21 +21,127 @@ + #include + #include + #include +-#include + #include "../common/arc4.h" + #include + +-static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, +- void *priv, void *priv2) ++/* ++ * Hash data from a BVEC-type iterator. 
++ */ ++static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize, ++ struct shash_desc *shash) + { +- struct shash_desc *shash = priv; +- int ret, *pret = priv2; ++ const struct bio_vec *bv = iter->bvec; ++ unsigned long start = iter->iov_offset; ++ unsigned int i; ++ void *p; ++ int ret; ++ ++ for (i = 0; i < iter->nr_segs; i++) { ++ size_t off, len; ++ ++ len = bv[i].bv_len; ++ if (start >= len) { ++ start -= len; ++ continue; ++ } ++ ++ len = min_t(size_t, maxsize, len - start); ++ off = bv[i].bv_offset + start; + +- ret = crypto_shash_update(shash, iter_base, len); +- if (ret < 0) { +- *pret = ret; +- return len; ++ p = kmap_local_page(bv[i].bv_page); ++ ret = crypto_shash_update(shash, p + off, len); ++ kunmap_local(p); ++ if (ret < 0) ++ return ret; ++ ++ maxsize -= len; ++ if (maxsize <= 0) ++ break; ++ start = 0; + } ++ ++ return 0; ++} ++ ++/* ++ * Hash data from a KVEC-type iterator. ++ */ ++static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize, ++ struct shash_desc *shash) ++{ ++ const struct kvec *kv = iter->kvec; ++ unsigned long start = iter->iov_offset; ++ unsigned int i; ++ int ret; ++ ++ for (i = 0; i < iter->nr_segs; i++) { ++ size_t len; ++ ++ len = kv[i].iov_len; ++ if (start >= len) { ++ start -= len; ++ continue; ++ } ++ ++ len = min_t(size_t, maxsize, len - start); ++ ret = crypto_shash_update(shash, kv[i].iov_base + start, len); ++ if (ret < 0) ++ return ret; ++ maxsize -= len; ++ ++ if (maxsize <= 0) ++ break; ++ start = 0; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Hash data from an XARRAY-type iterator. 
++ */ ++static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, ++ struct shash_desc *shash) ++{ ++ struct folio *folios[16], *folio; ++ unsigned int nr, i, j, npages; ++ loff_t start = iter->xarray_start + iter->iov_offset; ++ pgoff_t last, index = start / PAGE_SIZE; ++ ssize_t ret = 0; ++ size_t len, offset, foffset; ++ void *p; ++ ++ if (maxsize == 0) ++ return 0; ++ ++ last = (start + maxsize - 1) / PAGE_SIZE; ++ do { ++ nr = xa_extract(iter->xarray, (void **)folios, index, last, ++ ARRAY_SIZE(folios), XA_PRESENT); ++ if (nr == 0) ++ return -EIO; ++ ++ for (i = 0; i < nr; i++) { ++ folio = folios[i]; ++ npages = folio_nr_pages(folio); ++ foffset = start - folio_pos(folio); ++ offset = foffset % PAGE_SIZE; ++ for (j = foffset / PAGE_SIZE; j < npages; j++) { ++ len = min_t(size_t, maxsize, PAGE_SIZE - offset); ++ p = kmap_local_page(folio_page(folio, j)); ++ ret = crypto_shash_update(shash, p, len); ++ kunmap_local(p); ++ if (ret < 0) ++ return ret; ++ maxsize -= len; ++ if (maxsize <= 0) ++ return 0; ++ start += len; ++ offset = 0; ++ index++; ++ } ++ } ++ } while (nr == ARRAY_SIZE(folios)); + return 0; + } + +@@ -45,13 +151,21 @@ static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, + static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize, + struct shash_desc *shash) + { +- struct iov_iter tmp_iter = *iter; +- int err = -EIO; ++ if (maxsize == 0) ++ return 0; + +- if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err, +- cifs_shash_step) != maxsize) +- return err; +- return 0; ++ switch (iov_iter_type(iter)) { ++ case ITER_BVEC: ++ return cifs_shash_bvec(iter, maxsize, shash); ++ case ITER_KVEC: ++ return cifs_shash_kvec(iter, maxsize, shash); ++ case ITER_XARRAY: ++ return cifs_shash_xarray(iter, maxsize, shash); ++ default: ++ pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter)); ++ WARN_ON_ONCE(1); ++ return -EIO; ++ } + } + + int __cifs_calc_signature(struct smb_rqst *rqst, 
+diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h +index a71a988a92f9..939ef5844571 100644 +--- a/fs/smb/client/cifsglob.h ++++ b/fs/smb/client/cifsglob.h +@@ -255,7 +255,7 @@ struct smb_rqst { + struct kvec *rq_iov; /* array of kvecs */ + unsigned int rq_nvec; /* number of kvecs in array */ + struct iov_iter rq_iter; /* Data iterator */ +- struct folio_queue *rq_buffer; /* Buffer for encryption */ ++ struct xarray rq_buffer; /* Page buffer for encryption */ + }; + + struct mid_q_entry; +@@ -1485,6 +1485,7 @@ struct cifs_io_subrequest { + struct cifs_io_request *req; + }; + ssize_t got_bytes; ++ size_t actual_len; + unsigned int xid; + int result; + bool have_xid; +@@ -1549,6 +1550,7 @@ struct cifsInodeInfo { + #define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ + #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ + #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ ++#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */ + #define CIFS_INO_CLOSE_ON_LOCK (7) /* Not to defer the close when lock is set */ + unsigned long flags; + spinlock_t writers_lock; +diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c +index 131f20b91c3e..31d27a100357 100644 +--- a/fs/smb/client/cifssmb.c ++++ b/fs/smb/client/cifssmb.c +@@ -1334,9 +1334,10 @@ cifs_readv_callback(struct mid_q_entry *mid) + } + + rdata->credits.value = 0; +- rdata->subreq.transferred += rdata->got_bytes; +- INIT_WORK(&rdata->subreq.work, cifs_readv_worker); +- queue_work(cifsiod_wq, &rdata->subreq.work); ++ netfs_subreq_terminated(&rdata->subreq, ++ (rdata->result == 0 || rdata->result == -EAGAIN) ? 
++ rdata->got_bytes : rdata->result, ++ false); + release_mid(mid); + add_credits(server, &credits, 0); + } +diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c +index 78b59c4ef3ce..0fb2b0950306 100644 +--- a/fs/smb/client/file.c ++++ b/fs/smb/client/file.c +@@ -49,7 +49,6 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) + struct cifs_io_subrequest *wdata = + container_of(subreq, struct cifs_io_subrequest, subreq); + struct cifs_io_request *req = wdata->req; +- struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr]; + struct TCP_Server_Info *server; + struct cifsFileInfo *open_file = req->cfile; + size_t wsize = req->rreq.wsize; +@@ -74,7 +73,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) + } + } + +- rc = server->ops->wait_mtu_credits(server, wsize, &stream->sreq_max_len, ++ rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len, + &wdata->credits); + if (rc < 0) { + subreq->error = rc; +@@ -93,7 +92,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) + + #ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) +- stream->sreq_max_segs = server->smbd_conn->max_frmr_depth; ++ subreq->max_nr_segs = server->smbd_conn->max_frmr_depth; + #endif + } + +@@ -112,6 +111,7 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq) + goto fail; + } + ++ wdata->actual_len = wdata->subreq.len; + rc = adjust_credits(wdata->server, wdata, cifs_trace_rw_credits_issue_write_adjust); + if (rc) + goto fail; +@@ -140,22 +140,25 @@ static void cifs_netfs_invalidate_cache(struct netfs_io_request *wreq) + } + + /* +- * Negotiate the size of a read operation on behalf of the netfs library. ++ * Split the read up according to how many credits we can get for each piece. ++ * It's okay to sleep here if we need to wait for more credit to become ++ * available. ++ * ++ * We also choose the server and allocate an operation ID to be cleaned up ++ * later. 
+ */ +-static int cifs_prepare_read(struct netfs_io_subrequest *subreq) ++static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) + { + struct netfs_io_request *rreq = subreq->rreq; + struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); + struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); +- size_t size; +- int rc = 0; ++ size_t rsize; ++ int rc; + +- if (!rdata->have_xid) { +- rdata->xid = get_xid(); +- rdata->have_xid = true; +- } ++ rdata->xid = get_xid(); ++ rdata->have_xid = true; + rdata->server = server; + + if (cifs_sb->ctx->rsize == 0) +@@ -163,12 +166,13 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) + server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink), + cifs_sb->ctx); + +- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, +- &size, &rdata->credits); +- if (rc) +- return rc; + +- rreq->io_streams[0].sreq_max_len = size; ++ rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, ++ &rsize, &rdata->credits); ++ if (rc) { ++ subreq->error = rc; ++ return false; ++ } + + rdata->credits.in_flight_check = 1; + rdata->credits.rreq_debug_id = rreq->debug_id; +@@ -180,11 +184,14 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) + server->credits, server->in_flight, 0, + cifs_trace_rw_credits_read_submit); + ++ subreq->len = umin(subreq->len, rsize); ++ rdata->actual_len = subreq->len; ++ + #ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) +- rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth; ++ subreq->max_nr_segs = server->smbd_conn->max_frmr_depth; + #endif +- return 0; ++ return true; + } + + /* +@@ -193,41 +200,59 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) + * to only read a portion of that, but as long as we read something, the netfs + * helper will 
call us again so that we can issue another read. + */ +-static void cifs_issue_read(struct netfs_io_subrequest *subreq) ++static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) + { + struct netfs_io_request *rreq = subreq->rreq; + struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); + struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; ++ struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); + int rc = 0; + + cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", + __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, + subreq->transferred, subreq->len); + +- rc = adjust_credits(server, rdata, cifs_trace_rw_credits_issue_read_adjust); +- if (rc) +- goto failed; ++ if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { ++ /* ++ * As we're issuing a retry, we need to negotiate some new ++ * credits otherwise the server may reject the op with ++ * INVALID_PARAMETER. Note, however, we may get back less ++ * credit than we need to complete the op, in which case, we ++ * shorten the op and rely on additional rounds of retry. 
++ */ ++ size_t rsize = umin(subreq->len - subreq->transferred, ++ cifs_sb->ctx->rsize); ++ ++ rc = server->ops->wait_mtu_credits(server, rsize, &rdata->actual_len, ++ &rdata->credits); ++ if (rc) ++ goto out; ++ ++ rdata->credits.in_flight_check = 1; ++ ++ trace_smb3_rw_credits(rdata->rreq->debug_id, ++ rdata->subreq.debug_index, ++ rdata->credits.value, ++ server->credits, server->in_flight, 0, ++ cifs_trace_rw_credits_read_resubmit); ++ } + + if (req->cfile->invalidHandle) { + do { + rc = cifs_reopen_file(req->cfile, true); + } while (rc == -EAGAIN); + if (rc) +- goto failed; ++ goto out; + } + + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + +- trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + rc = rdata->server->ops->async_readv(rdata); ++out: + if (rc) +- goto failed; +- return; +- +-failed: +- netfs_read_subreq_terminated(subreq, rc, false); ++ netfs_subreq_terminated(subreq, rc, false); + } + + /* +@@ -291,6 +316,12 @@ static void cifs_rreq_done(struct netfs_io_request *rreq) + inode_set_atime_to_ts(inode, inode_get_mtime(inode)); + } + ++static void cifs_post_modify(struct inode *inode) ++{ ++ /* Indication to update ctime and mtime as close is deferred */ ++ set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); ++} ++ + static void cifs_free_request(struct netfs_io_request *rreq) + { + struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq); +@@ -338,9 +369,10 @@ const struct netfs_request_ops cifs_req_ops = { + .init_request = cifs_init_request, + .free_request = cifs_free_request, + .free_subrequest = cifs_free_subrequest, +- .prepare_read = cifs_prepare_read, +- .issue_read = cifs_issue_read, ++ .clamp_length = cifs_clamp_length, ++ .issue_read = cifs_req_issue_read, + .done = cifs_rreq_done, ++ .post_modify = cifs_post_modify, + .begin_writeback = cifs_begin_writeback, + .prepare_write = cifs_prepare_write, + .issue_write = cifs_issue_write, +@@ -1364,7 +1396,7 @@ int 
cifs_close(struct inode *inode, struct file *file) + dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); + if ((cfile->status_file_deleted == false) && + (smb2_can_defer_close(inode, dclose))) { +- if (test_and_clear_bit(NETFS_ICTX_MODIFIED_ATTR, &cinode->netfs.flags)) { ++ if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); + } +diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c +index 7381ec333c6d..c52291b1b175 100644 +--- a/fs/smb/client/smb2ops.c ++++ b/fs/smb/client/smb2ops.c +@@ -13,7 +13,6 @@ + #include + #include + #include +-#include + #include + #include "cifsfs.h" + #include "cifsglob.h" +@@ -302,8 +301,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, + unsigned int /*enum smb3_rw_credits_trace*/ trace) + { + struct cifs_credits *credits = &subreq->credits; +- int new_val = DIV_ROUND_UP(subreq->subreq.len - subreq->subreq.transferred, +- SMB2_MAX_BUFFER_SIZE); ++ int new_val = DIV_ROUND_UP(subreq->actual_len, SMB2_MAX_BUFFER_SIZE); + int scredits, in_flight; + + if (!credits->value || credits->value == new_val) +@@ -4394,86 +4392,30 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, + } + + /* +- * Clear a read buffer, discarding the folios which have the 1st mark set. ++ * Clear a read buffer, discarding the folios which have XA_MARK_0 set. + */ +-static void cifs_clear_folioq_buffer(struct folio_queue *buffer) ++static void cifs_clear_xarray_buffer(struct xarray *buffer) + { +- struct folio_queue *folioq; +- +- while ((folioq = buffer)) { +- for (int s = 0; s < folioq_count(folioq); s++) +- if (folioq_is_marked(folioq, s)) +- folio_put(folioq_folio(folioq, s)); +- buffer = folioq->next; +- kfree(folioq); +- } +-} +- +-/* +- * Allocate buffer space into a folio queue. 
+- */ +-static struct folio_queue *cifs_alloc_folioq_buffer(ssize_t size) +-{ +- struct folio_queue *buffer = NULL, *tail = NULL, *p; + struct folio *folio; +- unsigned int slot; +- +- do { +- if (!tail || folioq_full(tail)) { +- p = kmalloc(sizeof(*p), GFP_NOFS); +- if (!p) +- goto nomem; +- folioq_init(p); +- if (tail) { +- tail->next = p; +- p->prev = tail; +- } else { +- buffer = p; +- } +- tail = p; +- } +- +- folio = folio_alloc(GFP_KERNEL|__GFP_HIGHMEM, 0); +- if (!folio) +- goto nomem; +- +- slot = folioq_append_mark(tail, folio); +- size -= folioq_folio_size(tail, slot); +- } while (size > 0); +- +- return buffer; +- +-nomem: +- cifs_clear_folioq_buffer(buffer); +- return NULL; +-} +- +-/* +- * Copy data from an iterator to the folios in a folio queue buffer. +- */ +-static bool cifs_copy_iter_to_folioq(struct iov_iter *iter, size_t size, +- struct folio_queue *buffer) +-{ +- for (; buffer; buffer = buffer->next) { +- for (int s = 0; s < folioq_count(buffer); s++) { +- struct folio *folio = folioq_folio(buffer, s); +- size_t part = folioq_folio_size(buffer, s); + +- part = umin(part, size); ++ XA_STATE(xas, buffer, 0); + +- if (copy_folio_from_iter(folio, 0, part, iter) != part) +- return false; +- size -= part; +- } ++ rcu_read_lock(); ++ xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) { ++ folio_put(folio); + } +- return true; ++ rcu_read_unlock(); ++ xa_destroy(buffer); + } + + void + smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst) + { +- for (int i = 0; i < num_rqst; i++) +- cifs_clear_folioq_buffer(rqst[i].rq_buffer); ++ int i; ++ ++ for (i = 0; i < num_rqst; i++) ++ if (!xa_empty(&rqst[i].rq_buffer)) ++ cifs_clear_xarray_buffer(&rqst[i].rq_buffer); + } + + /* +@@ -4494,32 +4436,52 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, + struct smb_rqst *new_rq, struct smb_rqst *old_rq) + { + struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base; ++ struct page *page; + unsigned int orig_len = 0; ++ int 
i, j; + int rc = -ENOMEM; + +- for (int i = 1; i < num_rqst; i++) { ++ for (i = 1; i < num_rqst; i++) { + struct smb_rqst *old = &old_rq[i - 1]; + struct smb_rqst *new = &new_rq[i]; +- struct folio_queue *buffer; +- size_t size = iov_iter_count(&old->rq_iter); ++ struct xarray *buffer = &new->rq_buffer; ++ size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0; + + orig_len += smb_rqst_len(server, old); + new->rq_iov = old->rq_iov; + new->rq_nvec = old->rq_nvec; + ++ xa_init(buffer); ++ + if (size > 0) { +- buffer = cifs_alloc_folioq_buffer(size); +- if (!buffer) +- goto err_free; ++ unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE); ++ ++ for (j = 0; j < npages; j++) { ++ void *o; ++ ++ rc = -ENOMEM; ++ page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); ++ if (!page) ++ goto err_free; ++ page->index = j; ++ o = xa_store(buffer, j, page, GFP_KERNEL); ++ if (xa_is_err(o)) { ++ rc = xa_err(o); ++ put_page(page); ++ goto err_free; ++ } + +- new->rq_buffer = buffer; +- iov_iter_folio_queue(&new->rq_iter, ITER_SOURCE, +- buffer, 0, 0, size); ++ xa_set_mark(buffer, j, XA_MARK_0); + +- if (!cifs_copy_iter_to_folioq(&old->rq_iter, size, buffer)) { +- rc = -EIO; +- goto err_free; ++ seg = min_t(size_t, size - copied, PAGE_SIZE); ++ if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) { ++ rc = -EFAULT; ++ goto err_free; ++ } ++ copied += seg; + } ++ iov_iter_xarray(&new->rq_iter, ITER_SOURCE, ++ buffer, 0, size); + } + } + +@@ -4583,23 +4545,22 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, + } + + static int +-cifs_copy_folioq_to_iter(struct folio_queue *folioq, size_t data_size, +- size_t skip, struct iov_iter *iter) ++cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size, ++ unsigned int skip, struct iov_iter *iter) + { +- for (; folioq; folioq = folioq->next) { +- for (int s = 0; s < folioq_count(folioq); s++) { +- struct folio *folio = folioq_folio(folioq, s); +- size_t fsize = folio_size(folio); +- size_t n, len = umin(fsize - 
skip, data_size); +- +- n = copy_folio_to_iter(folio, skip, len, iter); +- if (n != len) { +- cifs_dbg(VFS, "%s: something went wrong\n", __func__); +- return -EIO; +- } +- data_size -= n; +- skip = 0; ++ struct page *page; ++ unsigned long index; ++ ++ xa_for_each(pages, index, page) { ++ size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size); ++ ++ n = copy_page_to_iter(page, skip, len, iter); ++ if (n != len) { ++ cifs_dbg(VFS, "%s: something went wrong\n", __func__); ++ return -EIO; + } ++ data_size -= n; ++ skip = 0; + } + + return 0; +@@ -4607,8 +4568,8 @@ cifs_copy_folioq_to_iter(struct folio_queue *folioq, size_t data_size, + + static int + handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, +- char *buf, unsigned int buf_len, struct folio_queue *buffer, +- unsigned int buffer_len, bool is_offloaded) ++ char *buf, unsigned int buf_len, struct xarray *pages, ++ unsigned int pages_len, bool is_offloaded) + { + unsigned int data_offset; + unsigned int data_len; +@@ -4705,7 +4666,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, + return 0; + } + +- if (data_len > buffer_len - pad_len) { ++ if (data_len > pages_len - pad_len) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + if (is_offloaded) +@@ -4716,8 +4677,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, + } + + /* Copy the data to the output I/O iterator. 
*/ +- rdata->result = cifs_copy_folioq_to_iter(buffer, buffer_len, +- cur_off, &rdata->subreq.io_iter); ++ rdata->result = cifs_copy_pages_to_iter(pages, pages_len, ++ cur_off, &rdata->subreq.io_iter); + if (rdata->result != 0) { + if (is_offloaded) + mid->mid_state = MID_RESPONSE_MALFORMED; +@@ -4725,11 +4686,12 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, + dequeue_mid(mid, rdata->result); + return 0; + } +- rdata->got_bytes = buffer_len; ++ rdata->got_bytes = pages_len; + + } else if (buf_len >= data_offset + data_len) { + /* read response payload is in buf */ +- WARN_ONCE(buffer, "read data can be either in buf or in buffer"); ++ WARN_ONCE(pages && !xa_empty(pages), ++ "read data can be either in buf or in pages"); + length = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter); + if (length < 0) + return length; +@@ -4755,7 +4717,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, + struct smb2_decrypt_work { + struct work_struct decrypt; + struct TCP_Server_Info *server; +- struct folio_queue *buffer; ++ struct xarray buffer; + char *buf; + unsigned int len; + }; +@@ -4769,7 +4731,7 @@ static void smb2_decrypt_offload(struct work_struct *work) + struct mid_q_entry *mid; + struct iov_iter iter; + +- iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, dw->len); ++ iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len); + rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, + &iter, true); + if (rc) { +@@ -4785,7 +4747,7 @@ static void smb2_decrypt_offload(struct work_struct *work) + mid->decrypted = true; + rc = handle_read_data(dw->server, mid, dw->buf, + dw->server->vals->read_rsp_size, +- dw->buffer, dw->len, ++ &dw->buffer, dw->len, + true); + if (rc >= 0) { + #ifdef CONFIG_CIFS_STATS2 +@@ -4818,7 +4780,7 @@ static void smb2_decrypt_offload(struct work_struct *work) + } + + free_pages: +- cifs_clear_folioq_buffer(dw->buffer); ++ 
cifs_clear_xarray_buffer(&dw->buffer); + cifs_small_buf_release(dw->buf); + kfree(dw); + } +@@ -4828,17 +4790,20 @@ static int + receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, + int *num_mids) + { ++ struct page *page; + char *buf = server->smallbuf; + struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; + struct iov_iter iter; +- unsigned int len; ++ unsigned int len, npages; + unsigned int buflen = server->pdu_size; + int rc; ++ int i = 0; + struct smb2_decrypt_work *dw; + + dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); + if (!dw) + return -ENOMEM; ++ xa_init(&dw->buffer); + INIT_WORK(&dw->decrypt, smb2_decrypt_offload); + dw->server = server; + +@@ -4854,14 +4819,26 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, + len = le32_to_cpu(tr_hdr->OriginalMessageSize) - + server->vals->read_rsp_size; + dw->len = len; +- len = round_up(dw->len, PAGE_SIZE); ++ npages = DIV_ROUND_UP(len, PAGE_SIZE); + + rc = -ENOMEM; +- dw->buffer = cifs_alloc_folioq_buffer(len); +- if (!dw->buffer) +- goto discard_data; ++ for (; i < npages; i++) { ++ void *old; ++ ++ page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); ++ if (!page) ++ goto discard_data; ++ page->index = i; ++ old = xa_store(&dw->buffer, i, page, GFP_KERNEL); ++ if (xa_is_err(old)) { ++ rc = xa_err(old); ++ put_page(page); ++ goto discard_data; ++ } ++ xa_set_mark(&dw->buffer, i, XA_MARK_0); ++ } + +- iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, len); ++ iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE); + + /* Read the data into the buffer and clear excess bufferage. 
*/ + rc = cifs_read_iter_from_socket(server, &iter, dw->len); +@@ -4869,9 +4846,9 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, + goto discard_data; + + server->total_read += rc; +- if (rc < len) +- iov_iter_zero(len - rc, &iter); +- iov_iter_revert(&iter, len); ++ if (rc < npages * PAGE_SIZE) ++ iov_iter_zero(npages * PAGE_SIZE - rc, &iter); ++ iov_iter_revert(&iter, npages * PAGE_SIZE); + iov_iter_truncate(&iter, dw->len); + + rc = cifs_discard_remaining_data(server); +@@ -4906,7 +4883,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, + (*mid)->decrypted = true; + rc = handle_read_data(server, *mid, buf, + server->vals->read_rsp_size, +- dw->buffer, dw->len, false); ++ &dw->buffer, dw->len, false); + if (rc >= 0) { + if (server->ops->is_network_name_deleted) { + server->ops->is_network_name_deleted(buf, +@@ -4916,7 +4893,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, + } + + free_pages: +- cifs_clear_folioq_buffer(dw->buffer); ++ cifs_clear_xarray_buffer(&dw->buffer); + free_dw: + kfree(dw); + return rc; +diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c +index 2cb1bf65a172..f68746becd64 100644 +--- a/fs/smb/client/smb2pdu.c ++++ b/fs/smb/client/smb2pdu.c +@@ -4499,7 +4499,9 @@ static void smb2_readv_worker(struct work_struct *work) + struct cifs_io_subrequest *rdata = + container_of(work, struct cifs_io_subrequest, subreq.work); + +- netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false); ++ netfs_subreq_terminated(&rdata->subreq, ++ (rdata->result == 0 || rdata->result == -EAGAIN) ? 
++ rdata->got_bytes : rdata->result, true); + } + + static void +@@ -4531,7 +4533,7 @@ smb2_readv_callback(struct mid_q_entry *mid) + + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu/%zu\n", + __func__, mid->mid, mid->mid_state, rdata->result, +- rdata->got_bytes, rdata->subreq.len - rdata->subreq.transferred); ++ rdata->actual_len, rdata->subreq.len - rdata->subreq.transferred); + + switch (mid->mid_state) { + case MID_RESPONSE_RECEIVED: +@@ -4553,7 +4555,6 @@ smb2_readv_callback(struct mid_q_entry *mid) + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: +- __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags); + rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ +@@ -4588,7 +4589,7 @@ smb2_readv_callback(struct mid_q_entry *mid) + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, +- rdata->subreq.len - rdata->subreq.transferred, ++ rdata->actual_len, + rdata->result); + } else + trace_smb3_read_done(rdata->rreq->debug_id, +@@ -4603,9 +4604,9 @@ smb2_readv_callback(struct mid_q_entry *mid) + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } else { +- size_t trans = rdata->subreq.transferred + rdata->got_bytes; +- if (trans < rdata->subreq.len && +- rdata->subreq.start + trans == ictx->remote_i_size) { ++ if (rdata->got_bytes < rdata->actual_len && ++ rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes == ++ ictx->remote_i_size) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } +@@ -4614,8 +4615,6 @@ smb2_readv_callback(struct mid_q_entry *mid) + server->credits, server->in_flight, + 0, cifs_trace_rw_credits_read_response_clear); + rdata->credits.value = 0; +- rdata->subreq.transferred += rdata->got_bytes; +- trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); + INIT_WORK(&rdata->subreq.work, smb2_readv_worker); + 
queue_work(cifsiod_wq, &rdata->subreq.work); + release_mid(mid); +@@ -4650,7 +4649,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) + io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink); + io_parms.server = server = rdata->server; + io_parms.offset = subreq->start + subreq->transferred; +- io_parms.length = subreq->len - subreq->transferred; ++ io_parms.length = rdata->actual_len; + io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; + io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; + io_parms.pid = rdata->req->pid; +@@ -4671,7 +4670,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) + shdr = (struct smb2_hdr *)buf; + + if (rdata->credits.value > 0) { +- shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(io_parms.length, ++ shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->actual_len, + SMB2_MAX_BUFFER_SIZE)); + credit_request = le16_to_cpu(shdr->CreditCharge) + 8; + if (server->credits >= server->max_credits) +@@ -4699,8 +4698,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) + rdata->xid, io_parms.persistent_fid, + io_parms.tcon->tid, + io_parms.tcon->ses->Suid, +- io_parms.offset, +- subreq->len - subreq->transferred, rc); ++ io_parms.offset, rdata->actual_len, rc); + } + + async_readv_out: +@@ -4883,7 +4881,6 @@ smb2_writev_callback(struct mid_q_entry *mid) + server->credits, server->in_flight, + 0, cifs_trace_rw_credits_write_response_clear); + wdata->credits.value = 0; +- trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); + cifs_write_subrequest_terminated(wdata, result ?: written, true); + release_mid(mid); + trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, +diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c +index 0c64b37e2660..8f782edc3fd7 100644 +--- a/fs/smb/client/smbdirect.c ++++ b/fs/smb/client/smbdirect.c +@@ -6,7 +6,6 @@ + */ + #include + #include +-#include + #include "smbdirect.h" + #include "cifs_debug.h" + #include "cifsproto.h" +@@ -2462,8 +2461,6 @@ static 
ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter, + start = 0; + } + +- if (ret > 0) +- iov_iter_advance(iter, ret); + return ret; + } + +@@ -2520,65 +2517,50 @@ static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, + start = 0; + } + +- if (ret > 0) +- iov_iter_advance(iter, ret); + return ret; + } + + /* +- * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA +- * list. The folios are not pinned. ++ * Extract folio fragments from an XARRAY-class iterator and add them to an ++ * RDMA list. The folios are not pinned. + */ +-static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, ++static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) + { +- const struct folio_queue *folioq = iter->folioq; +- unsigned int slot = iter->folioq_slot; ++ struct xarray *xa = iter->xarray; ++ struct folio *folio; ++ loff_t start = iter->xarray_start + iter->iov_offset; ++ pgoff_t index = start / PAGE_SIZE; + ssize_t ret = 0; +- size_t offset = iter->iov_offset; +- +- BUG_ON(!folioq); ++ size_t off, len; ++ XA_STATE(xas, xa, index); + +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = folioq->next; +- if (WARN_ON_ONCE(!folioq)) +- return -EIO; +- slot = 0; +- } ++ rcu_read_lock(); + +- do { +- struct folio *folio = folioq_folio(folioq, slot); +- size_t fsize = folioq_folio_size(folioq, slot); +- +- if (offset < fsize) { +- size_t part = umin(maxsize - ret, fsize - offset); ++ xas_for_each(&xas, folio, ULONG_MAX) { ++ if (xas_retry(&xas, folio)) ++ continue; ++ if (WARN_ON(xa_is_value(folio))) ++ break; ++ if (WARN_ON(folio_test_hugetlb(folio))) ++ break; + +- if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part)) +- return -EIO; ++ off = offset_in_folio(folio, start); ++ len = min_t(size_t, maxsize, folio_size(folio) - off); + +- offset += part; +- ret += part; ++ if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) { ++ rcu_read_unlock(); ++ return -EIO; + } + +- 
if (offset >= fsize) { +- offset = 0; +- slot++; +- if (slot >= folioq_nr_slots(folioq)) { +- if (!folioq->next) { +- WARN_ON_ONCE(ret < iter->count); +- break; +- } +- folioq = folioq->next; +- slot = 0; +- } +- } +- } while (rdma->nr_sge < rdma->max_sge || maxsize > 0); ++ maxsize -= len; ++ ret += len; ++ if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) ++ break; ++ } + +- iter->folioq = folioq; +- iter->folioq_slot = slot; +- iter->iov_offset = offset; +- iter->count -= ret; ++ rcu_read_unlock(); + return ret; + } + +@@ -2606,15 +2588,17 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, + case ITER_KVEC: + ret = smb_extract_kvec_to_rdma(iter, rdma, len); + break; +- case ITER_FOLIOQ: +- ret = smb_extract_folioq_to_rdma(iter, rdma, len); ++ case ITER_XARRAY: ++ ret = smb_extract_xarray_to_rdma(iter, rdma, len); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + +- if (ret < 0) { ++ if (ret > 0) { ++ iov_iter_advance(iter, ret); ++ } else if (ret < 0) { + while (rdma->nr_sge > before) { + struct ib_sge *sge = &rdma->sge[rdma->nr_sge--]; + +diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h +deleted file mode 100644 +index 955680c3bb5f..000000000000 +--- a/include/linux/folio_queue.h ++++ /dev/null +@@ -1,156 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-or-later */ +-/* Queue of folios definitions +- * +- * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. +- * Written by David Howells (dhowells@redhat.com) +- */ +- +-#ifndef _LINUX_FOLIO_QUEUE_H +-#define _LINUX_FOLIO_QUEUE_H +- +-#include +- +-/* +- * Segment in a queue of running buffers. Each segment can hold a number of +- * folios and a portion of the queue can be referenced with the ITER_FOLIOQ +- * iterator. The possibility exists of inserting non-folio elements into the +- * queue (such as gaps). 
+- * +- * Explicit prev and next pointers are used instead of a list_head to make it +- * easier to add segments to tail and remove them from the head without the +- * need for a lock. +- */ +-struct folio_queue { +- struct folio_batch vec; /* Folios in the queue segment */ +- u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ +- struct folio_queue *next; /* Next queue segment or NULL */ +- struct folio_queue *prev; /* Previous queue segment of NULL */ +- unsigned long marks; /* 1-bit mark per folio */ +- unsigned long marks2; /* Second 1-bit mark per folio */ +- unsigned long marks3; /* Third 1-bit mark per folio */ +-#if PAGEVEC_SIZE > BITS_PER_LONG +-#error marks is not big enough +-#endif +-}; +- +-static inline void folioq_init(struct folio_queue *folioq) +-{ +- folio_batch_init(&folioq->vec); +- folioq->next = NULL; +- folioq->prev = NULL; +- folioq->marks = 0; +- folioq->marks2 = 0; +- folioq->marks3 = 0; +-} +- +-static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) +-{ +- return PAGEVEC_SIZE; +-} +- +-static inline unsigned int folioq_count(struct folio_queue *folioq) +-{ +- return folio_batch_count(&folioq->vec); +-} +- +-static inline bool folioq_full(struct folio_queue *folioq) +-{ +- //return !folio_batch_space(&folioq->vec); +- return folioq_count(folioq) >= folioq_nr_slots(folioq); +-} +- +-static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot) +-{ +- return test_bit(slot, &folioq->marks); +-} +- +-static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot) +-{ +- set_bit(slot, &folioq->marks); +-} +- +-static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot) +-{ +- clear_bit(slot, &folioq->marks); +-} +- +-static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot) +-{ +- return test_bit(slot, &folioq->marks2); +-} +- +-static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot) +-{ +- set_bit(slot, 
&folioq->marks2); +-} +- +-static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) +-{ +- clear_bit(slot, &folioq->marks2); +-} +- +-static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot) +-{ +- return test_bit(slot, &folioq->marks3); +-} +- +-static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot) +-{ +- set_bit(slot, &folioq->marks3); +-} +- +-static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) +-{ +- clear_bit(slot, &folioq->marks3); +-} +- +-static inline unsigned int __folio_order(struct folio *folio) +-{ +- if (!folio_test_large(folio)) +- return 0; +- return folio->_flags_1 & 0xff; +-} +- +-static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio) +-{ +- unsigned int slot = folioq->vec.nr++; +- +- folioq->vec.folios[slot] = folio; +- folioq->orders[slot] = __folio_order(folio); +- return slot; +-} +- +-static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio) +-{ +- unsigned int slot = folioq->vec.nr++; +- +- folioq->vec.folios[slot] = folio; +- folioq->orders[slot] = __folio_order(folio); +- folioq_mark(folioq, slot); +- return slot; +-} +- +-static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot) +-{ +- return folioq->vec.folios[slot]; +-} +- +-static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot) +-{ +- return folioq->orders[slot]; +-} +- +-static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot) +-{ +- return PAGE_SIZE << folioq_folio_order(folioq, slot); +-} +- +-static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) +-{ +- folioq->vec.folios[slot] = NULL; +- folioq_unmark(folioq, slot); +- folioq_unmark2(folioq, slot); +- folioq_unmark3(folioq, slot); +-} +- +-#endif /* _LINUX_FOLIO_QUEUE_H */ +diff --git a/include/linux/iov_iter.h 
b/include/linux/iov_iter.h +index c4aa58032faf..270454a6703d 100644 +--- a/include/linux/iov_iter.h ++++ b/include/linux/iov_iter.h +@@ -10,7 +10,6 @@ + + #include + #include +-#include + + typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2); +@@ -141,60 +140,6 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, + return progress; + } + +-/* +- * Handle ITER_FOLIOQ. +- */ +-static __always_inline +-size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, +- iov_step_f step) +-{ +- const struct folio_queue *folioq = iter->folioq; +- unsigned int slot = iter->folioq_slot; +- size_t progress = 0, skip = iter->iov_offset; +- +- if (slot == folioq_nr_slots(folioq)) { +- /* The iterator may have been extended. */ +- folioq = folioq->next; +- slot = 0; +- } +- +- do { +- struct folio *folio = folioq_folio(folioq, slot); +- size_t part, remain, consumed; +- size_t fsize; +- void *base; +- +- if (!folio) +- break; +- +- fsize = folioq_folio_size(folioq, slot); +- base = kmap_local_folio(folio, skip); +- part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); +- remain = step(base, progress, part, priv, priv2); +- kunmap_local(base); +- consumed = part - remain; +- len -= consumed; +- progress += consumed; +- skip += consumed; +- if (skip >= fsize) { +- skip = 0; +- slot++; +- if (slot == folioq_nr_slots(folioq) && folioq->next) { +- folioq = folioq->next; +- slot = 0; +- } +- } +- if (remain) +- break; +- } while (len); +- +- iter->folioq_slot = slot; +- iter->folioq = folioq; +- iter->iov_offset = skip; +- iter->count -= progress; +- return progress; +-} +- + /* + * Handle ITER_XARRAY. 
+ */ +@@ -304,8 +249,6 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, + return iterate_bvec(iter, len, priv, priv2, step); + if (iov_iter_is_kvec(iter)) + return iterate_kvec(iter, len, priv, priv2, step); +- if (iov_iter_is_folioq(iter)) +- return iterate_folioq(iter, len, priv, priv2, step); + if (iov_iter_is_xarray(iter)) + return iterate_xarray(iter, len, priv, priv2, step); + return iterate_discard(iter, len, priv, priv2, step); +@@ -328,51 +271,4 @@ size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, + return iterate_and_advance2(iter, len, priv, NULL, ustep, step); + } + +-/** +- * iterate_and_advance_kernel - Iterate over a kernel-internal iterator +- * @iter: The iterator to iterate over. +- * @len: The amount to iterate over. +- * @priv: Data for the step functions. +- * @priv2: More data for the step functions. +- * @step: Function for other iterators; given kernel addresses. +- * +- * Iterate over the next part of an iterator, up to the specified length. The +- * buffer is presented in segments, which for kernel iteration are broken up by +- * physical pages and mapped, with the mapped address being presented. +- * +- * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type +- * iterators; it will not handle UBUF or IOVEC-type iterators. +- * +- * A step functions, @step, must be provided, one for handling mapped kernel +- * addresses and the other is given user addresses which have the potential to +- * fault since no pinning is performed. +- * +- * The step functions are passed the address and length of the segment, @priv, +- * @priv2 and the amount of data so far iterated over (which can, for example, +- * be added to @priv to point to the right part of a second buffer). The step +- * functions should return the amount of the segment they didn't process (ie. 0 +- * indicates complete processsing). +- * +- * This function returns the amount of data processed (ie. 
0 means nothing was +- * processed and the value of @len means processes to completion). +- */ +-static __always_inline +-size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, +- void *priv2, iov_step_f step) +-{ +- if (unlikely(iter->count < len)) +- len = iter->count; +- if (unlikely(!len)) +- return 0; +- if (iov_iter_is_bvec(iter)) +- return iterate_bvec(iter, len, priv, priv2, step); +- if (iov_iter_is_kvec(iter)) +- return iterate_kvec(iter, len, priv, priv2, step); +- if (iov_iter_is_folioq(iter)) +- return iterate_folioq(iter, len, priv, priv2, step); +- if (iov_iter_is_xarray(iter)) +- return iterate_xarray(iter, len, priv, priv2, step); +- return iterate_discard(iter, len, priv, priv2, step); +-} +- + #endif /* _LINUX_IOV_ITER_H */ +diff --git a/include/linux/netfs.h b/include/linux/netfs.h +index 5eaceef41e6c..c47443e7a97e 100644 +--- a/include/linux/netfs.h ++++ b/include/linux/netfs.h +@@ -38,8 +38,11 @@ static inline void folio_start_private_2(struct folio *folio) + folio_set_private_2(folio); + } + ++/* Marks used on xarray-based buffers */ ++#define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */ ++#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ ++ + enum netfs_io_source { +- NETFS_SOURCE_UNKNOWN, + NETFS_FILL_WITH_ZEROES, + NETFS_DOWNLOAD_FROM_SERVER, + NETFS_READ_FROM_CACHE, +@@ -70,7 +73,6 @@ struct netfs_inode { + #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ + #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ + #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ +-#define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ + }; + + /* +@@ -131,11 +133,9 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio) + struct netfs_io_stream { + /* Submission tracking */ + struct netfs_io_subrequest *construct; /* Op being constructed */ +- size_t sreq_max_len; /* Maximum size of a subrequest */ +- 
unsigned int sreq_max_segs; /* 0 or max number of segments in an iterator */ + unsigned int submit_off; /* Folio offset we're submitting from */ + unsigned int submit_len; /* Amount of data left to submit */ +- unsigned int submit_extendable_to; /* Amount I/O can be rounded up to */ ++ unsigned int submit_max_len; /* Amount I/O can be rounded up to */ + void (*prepare_write)(struct netfs_io_subrequest *subreq); + void (*issue_write)(struct netfs_io_subrequest *subreq); + /* Collection tracking */ +@@ -176,45 +176,41 @@ struct netfs_io_subrequest { + struct list_head rreq_link; /* Link in rreq->subrequests */ + struct iov_iter io_iter; /* Iterator for this subrequest */ + unsigned long long start; /* Where to start the I/O */ ++ size_t max_len; /* Maximum size of the I/O */ + size_t len; /* Size of the I/O */ + size_t transferred; /* Amount of data transferred */ +- size_t consumed; /* Amount of read data consumed */ +- size_t prev_donated; /* Amount of data donated from previous subreq */ +- size_t next_donated; /* Amount of data donated from next subreq */ + refcount_t ref; + short error; /* 0 or error that occurred */ + unsigned short debug_index; /* Index in list (for debugging output) */ + unsigned int nr_segs; /* Number of segs in io_iter */ ++ unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */ + enum netfs_io_source source; /* Where to read from/write to */ + unsigned char stream_nr; /* I/O stream this belongs to */ +- unsigned char curr_folioq_slot; /* Folio currently being read */ +- unsigned char curr_folio_order; /* Order of folio */ +- struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ + unsigned long flags; + #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ + #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ ++#define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */ + #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() 
should SEEK_DATA first */ + #define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ + #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ + #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ +-#define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ + #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ + #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ + #define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ + #define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ ++#define NETFS_SREQ_HIT_EOF 12 /* Set if we hit the EOF */ + }; + + enum netfs_io_origin { + NETFS_READAHEAD, /* This read was triggered by readahead */ + NETFS_READPAGE, /* This read is a synchronous read */ +- NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ + NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ +- NETFS_DIO_READ, /* This is a direct I/O read */ ++ NETFS_COPY_TO_CACHE, /* This write is to copy a read to the cache */ + NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ + NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ ++ NETFS_DIO_READ, /* This is a direct I/O read */ + NETFS_DIO_WRITE, /* This is a direct I/O write */ +- NETFS_PGPRIV2_COPY_TO_CACHE, /* [DEPRECATED] This is writing read data to the cache */ + nr__netfs_io_origin + } __mode(byte); + +@@ -231,14 +227,11 @@ struct netfs_io_request { + struct address_space *mapping; /* The mapping being accessed */ + struct kiocb *iocb; /* AIO completion vector */ + struct netfs_cache_resources cache_resources; +- struct readahead_control *ractl; /* Readahead descriptor */ + struct list_head proc_link; /* Link in netfs_iorequests */ + struct list_head subrequests; /* Contributory I/O operations */ + struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O 
operations */ + #define NR_IO_STREAMS 2 //wreq->nr_io_streams + struct netfs_group *group; /* Writeback group being written back */ +- struct folio_queue *buffer; /* Head of I/O buffer */ +- struct folio_queue *buffer_tail; /* Tail of I/O buffer */ + struct iov_iter iter; /* Unencrypted-side iterator */ + struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ + void *netfs_priv; /* Private data for the netfs */ +@@ -252,23 +245,24 @@ struct netfs_io_request { + unsigned int nr_group_rel; /* Number of refs to release on ->group */ + spinlock_t lock; /* Lock for queuing subreqs */ + atomic_t nr_outstanding; /* Number of ops in progress */ ++ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ ++ size_t upper_len; /* Length can be extended to here */ + unsigned long long submitted; /* Amount submitted for I/O so far */ + unsigned long long len; /* Length of the request */ + size_t transferred; /* Amount to be indicated as transferred */ +- long error; /* 0 or error that occurred */ ++ short error; /* 0 or error that occurred */ + enum netfs_io_origin origin; /* Origin of the request */ + bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ +- u8 buffer_head_slot; /* First slot in ->buffer */ +- u8 buffer_tail_slot; /* Next slot in ->buffer_tail */ + unsigned long long i_size; /* Size of the file */ + unsigned long long start; /* Start position */ + atomic64_t issued_to; /* Write issuer folio cursor */ ++ unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */ + unsigned long long collected_to; /* Point we've collected to */ + unsigned long long cleaned_to; /* Position we've cleaned folios to */ + pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ +- size_t prev_donated; /* Fallback for subreq->prev_donated */ + refcount_t ref; + unsigned long flags; ++#define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */ + #define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the 
cache */ + #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ + #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ +@@ -280,7 +274,6 @@ struct netfs_io_request { + #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ + #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ + #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ +-#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ + #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark + * write to cache on read */ + const struct netfs_request_ops *netfs_ops; +@@ -299,7 +292,7 @@ struct netfs_request_ops { + + /* Read request handling */ + void (*expand_readahead)(struct netfs_io_request *rreq); +- int (*prepare_read)(struct netfs_io_subrequest *subreq); ++ bool (*clamp_length)(struct netfs_io_subrequest *subreq); + void (*issue_read)(struct netfs_io_subrequest *subreq); + bool (*is_still_valid)(struct netfs_io_request *rreq); + int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, +@@ -429,10 +422,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); + vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); + + /* (Sub)request management API. 
*/ +-void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, +- bool was_async); +-void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, +- int error, bool was_async); ++void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); + void netfs_get_subrequest(struct netfs_io_subrequest *subreq, + enum netfs_sreq_ref_trace what); + void netfs_put_subrequest(struct netfs_io_subrequest *subreq, +diff --git a/include/linux/uio.h b/include/linux/uio.h +index 853f9de5aa05..7020adedfa08 100644 +--- a/include/linux/uio.h ++++ b/include/linux/uio.h +@@ -11,7 +11,6 @@ + #include + + struct page; +-struct folio_queue; + + typedef unsigned int __bitwise iov_iter_extraction_t; + +@@ -26,7 +25,6 @@ enum iter_type { + ITER_IOVEC, + ITER_BVEC, + ITER_KVEC, +- ITER_FOLIOQ, + ITER_XARRAY, + ITER_DISCARD, + }; +@@ -68,7 +66,6 @@ struct iov_iter { + const struct iovec *__iov; + const struct kvec *kvec; + const struct bio_vec *bvec; +- const struct folio_queue *folioq; + struct xarray *xarray; + void __user *ubuf; + }; +@@ -77,7 +74,6 @@ struct iov_iter { + }; + union { + unsigned long nr_segs; +- u8 folioq_slot; + loff_t xarray_start; + }; + }; +@@ -130,11 +126,6 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i) + return iov_iter_type(i) == ITER_DISCARD; + } + +-static inline bool iov_iter_is_folioq(const struct iov_iter *i) +-{ +- return iov_iter_type(i) == ITER_FOLIOQ; +-} +- + static inline bool iov_iter_is_xarray(const struct iov_iter *i) + { + return iov_iter_type(i) == ITER_XARRAY; +@@ -189,12 +180,6 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, + return copy_page_to_iter(&folio->page, offset, bytes, i); + } + +-static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, +- size_t bytes, struct iov_iter *i) +-{ +- return copy_page_from_iter(&folio->page, offset, bytes, i); +-} +- + static inline size_t copy_folio_from_iter_atomic(struct folio *folio, + size_t offset, 
size_t bytes, struct iov_iter *i) + { +@@ -288,9 +273,6 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec + void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, + unsigned long nr_segs, size_t count); + void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); +-void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, +- const struct folio_queue *folioq, +- unsigned int first_slot, unsigned int offset, size_t count); + void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, + loff_t start, size_t count); + ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, +diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h +index 76bd42a96815..606b4a0f92da 100644 +--- a/include/trace/events/netfs.h ++++ b/include/trace/events/netfs.h +@@ -20,7 +20,6 @@ + EM(netfs_read_trace_expanded, "EXPANDED ") \ + EM(netfs_read_trace_readahead, "READAHEAD") \ + EM(netfs_read_trace_readpage, "READPAGE ") \ +- EM(netfs_read_trace_read_gaps, "READ-GAPS") \ + EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \ + E_(netfs_read_trace_write_begin, "WRITEBEGN") + +@@ -34,14 +33,13 @@ + #define netfs_rreq_origins \ + EM(NETFS_READAHEAD, "RA") \ + EM(NETFS_READPAGE, "RP") \ +- EM(NETFS_READ_GAPS, "RG") \ + EM(NETFS_READ_FOR_WRITE, "RW") \ +- EM(NETFS_DIO_READ, "DR") \ ++ EM(NETFS_COPY_TO_CACHE, "CC") \ + EM(NETFS_WRITEBACK, "WB") \ + EM(NETFS_WRITETHROUGH, "WT") \ + EM(NETFS_UNBUFFERED_WRITE, "UW") \ +- EM(NETFS_DIO_WRITE, "DW") \ +- E_(NETFS_PGPRIV2_COPY_TO_CACHE, "2C") ++ EM(NETFS_DIO_READ, "DR") \ ++ E_(NETFS_DIO_WRITE, "DW") + + #define netfs_rreq_traces \ + EM(netfs_rreq_trace_assess, "ASSESS ") \ +@@ -62,7 +60,6 @@ + E_(netfs_rreq_trace_write_done, "WR-DONE") + + #define netfs_sreq_sources \ +- EM(NETFS_SOURCE_UNKNOWN, "----") \ + EM(NETFS_FILL_WITH_ZEROES, "ZERO") \ + EM(NETFS_DOWNLOAD_FROM_SERVER, "DOWN") \ + 
EM(NETFS_READ_FROM_CACHE, "READ") \ +@@ -72,25 +69,15 @@ + E_(NETFS_INVALID_WRITE, "INVL") + + #define netfs_sreq_traces \ +- EM(netfs_sreq_trace_add_donations, "+DON ") \ +- EM(netfs_sreq_trace_added, "ADD ") \ +- EM(netfs_sreq_trace_clear, "CLEAR") \ + EM(netfs_sreq_trace_discard, "DSCRD") \ +- EM(netfs_sreq_trace_donate_to_prev, "DON-P") \ +- EM(netfs_sreq_trace_donate_to_next, "DON-N") \ + EM(netfs_sreq_trace_download_instead, "RDOWN") \ + EM(netfs_sreq_trace_fail, "FAIL ") \ + EM(netfs_sreq_trace_free, "FREE ") \ +- EM(netfs_sreq_trace_hit_eof, "EOF ") \ +- EM(netfs_sreq_trace_io_progress, "IO ") \ + EM(netfs_sreq_trace_limited, "LIMIT") \ + EM(netfs_sreq_trace_prepare, "PREP ") \ + EM(netfs_sreq_trace_prep_failed, "PRPFL") \ +- EM(netfs_sreq_trace_progress, "PRGRS") \ +- EM(netfs_sreq_trace_reprep_failed, "REPFL") \ ++ EM(netfs_sreq_trace_resubmit_short, "SHORT") \ + EM(netfs_sreq_trace_retry, "RETRY") \ +- EM(netfs_sreq_trace_short, "SHORT") \ +- EM(netfs_sreq_trace_split, "SPLIT") \ + EM(netfs_sreq_trace_submit, "SUBMT") \ + EM(netfs_sreq_trace_terminated, "TERM ") \ + EM(netfs_sreq_trace_write, "WRITE") \ +@@ -131,7 +118,7 @@ + EM(netfs_sreq_trace_new, "NEW ") \ + EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \ + EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \ +- EM(netfs_sreq_trace_put_consumed, "PUT CONSUME") \ ++ EM(netfs_sreq_trace_put_discard, "PUT DISCARD") \ + EM(netfs_sreq_trace_put_done, "PUT DONE ") \ + EM(netfs_sreq_trace_put_failed, "PUT FAILED ") \ + EM(netfs_sreq_trace_put_merged, "PUT MERGED ") \ +@@ -142,6 +129,7 @@ + E_(netfs_sreq_trace_put_terminated, "PUT TERM ") + + #define netfs_folio_traces \ ++ /* The first few correspond to enum netfs_how_to_modify */ \ + EM(netfs_folio_is_uptodate, "mod-uptodate") \ + EM(netfs_just_prefetch, "mod-prefetch") \ + EM(netfs_whole_folio_modify, "mod-whole-f") \ +@@ -151,9 +139,8 @@ + EM(netfs_flush_content, "flush") \ + EM(netfs_streaming_filled_page, "mod-streamw-f") \ + 
EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ +- EM(netfs_folio_trace_abandon, "abandon") \ ++ /* The rest are for writeback */ \ + EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ +- EM(netfs_folio_trace_cancel_store, "cancel-store") \ + EM(netfs_folio_trace_clear, "clear") \ + EM(netfs_folio_trace_clear_cc, "clear-cc") \ + EM(netfs_folio_trace_clear_g, "clear-g") \ +@@ -168,12 +155,7 @@ + EM(netfs_folio_trace_mkwrite, "mkwrite") \ + EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ + EM(netfs_folio_trace_not_under_wback, "!wback") \ +- EM(netfs_folio_trace_put, "put") \ +- EM(netfs_folio_trace_read, "read") \ +- EM(netfs_folio_trace_read_done, "read-done") \ + EM(netfs_folio_trace_read_gaps, "read-gaps") \ +- EM(netfs_folio_trace_read_put, "read-put") \ +- EM(netfs_folio_trace_read_unlock, "read-unlock") \ + EM(netfs_folio_trace_redirtied, "redirtied") \ + EM(netfs_folio_trace_store, "store") \ + EM(netfs_folio_trace_store_copy, "store-copy") \ +@@ -186,12 +168,6 @@ + EM(netfs_contig_trace_jump, "-->JUMP-->") \ + E_(netfs_contig_trace_unlock, "Unlock") + +-#define netfs_donate_traces \ +- EM(netfs_trace_donate_tail_to_prev, "tail-to-prev") \ +- EM(netfs_trace_donate_to_prev, "to-prev") \ +- EM(netfs_trace_donate_to_next, "to-next") \ +- E_(netfs_trace_donate_to_deferred_next, "defer-next") +- + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY + #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY + +@@ -209,7 +185,6 @@ enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte); + enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte); + enum netfs_folio_trace { netfs_folio_traces } __mode(byte); + enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte); +-enum netfs_donate_trace { netfs_donate_traces } __mode(byte); + + #endif + +@@ -232,7 +207,6 @@ netfs_rreq_ref_traces; + netfs_sreq_ref_traces; + netfs_folio_traces; + netfs_collect_contig_traces; +-netfs_donate_traces; + + /* + * Now redefine the EM() and E_() macros to 
map the enums to the strings that +@@ -253,7 +227,6 @@ TRACE_EVENT(netfs_read, + TP_STRUCT__entry( + __field(unsigned int, rreq ) + __field(unsigned int, cookie ) +- __field(loff_t, i_size ) + __field(loff_t, start ) + __field(size_t, len ) + __field(enum netfs_read_trace, what ) +@@ -263,19 +236,18 @@ TRACE_EVENT(netfs_read, + TP_fast_assign( + __entry->rreq = rreq->debug_id; + __entry->cookie = rreq->cache_resources.debug_id; +- __entry->i_size = rreq->i_size; + __entry->start = start; + __entry->len = len; + __entry->what = what; + __entry->netfs_inode = rreq->inode->i_ino; + ), + +- TP_printk("R=%08x %s c=%08x ni=%x s=%llx l=%zx sz=%llx", ++ TP_printk("R=%08x %s c=%08x ni=%x s=%llx %zx", + __entry->rreq, + __print_symbolic(__entry->what, netfs_read_traces), + __entry->cookie, + __entry->netfs_inode, +- __entry->start, __entry->len, __entry->i_size) ++ __entry->start, __entry->len) + ); + + TRACE_EVENT(netfs_rreq, +@@ -541,6 +513,33 @@ TRACE_EVENT(netfs_collect, + __entry->start + __entry->len) + ); + ++TRACE_EVENT(netfs_collect_contig, ++ TP_PROTO(const struct netfs_io_request *wreq, unsigned long long to, ++ enum netfs_collect_contig_trace type), ++ ++ TP_ARGS(wreq, to, type), ++ ++ TP_STRUCT__entry( ++ __field(unsigned int, wreq) ++ __field(enum netfs_collect_contig_trace, type) ++ __field(unsigned long long, contiguity) ++ __field(unsigned long long, to) ++ ), ++ ++ TP_fast_assign( ++ __entry->wreq = wreq->debug_id; ++ __entry->type = type; ++ __entry->contiguity = wreq->contiguity; ++ __entry->to = to; ++ ), ++ ++ TP_printk("R=%08x %llx -> %llx %s", ++ __entry->wreq, ++ __entry->contiguity, ++ __entry->to, ++ __print_symbolic(__entry->type, netfs_collect_contig_traces)) ++ ); ++ + TRACE_EVENT(netfs_collect_sreq, + TP_PROTO(const struct netfs_io_request *wreq, + const struct netfs_io_subrequest *subreq), +@@ -612,6 +611,7 @@ TRACE_EVENT(netfs_collect_state, + __field(unsigned int, notes ) + __field(unsigned long long, collected_to ) + __field(unsigned long 
long, cleaned_to ) ++ __field(unsigned long long, contiguity ) + ), + + TP_fast_assign( +@@ -619,11 +619,12 @@ TRACE_EVENT(netfs_collect_state, + __entry->notes = notes; + __entry->collected_to = collected_to; + __entry->cleaned_to = wreq->cleaned_to; ++ __entry->contiguity = wreq->contiguity; + ), + +- TP_printk("R=%08x col=%llx cln=%llx n=%x", ++ TP_printk("R=%08x cto=%llx fto=%llx ctg=%llx n=%x", + __entry->wreq, __entry->collected_to, +- __entry->cleaned_to, ++ __entry->cleaned_to, __entry->contiguity, + __entry->notes) + ); + +@@ -680,71 +681,6 @@ TRACE_EVENT(netfs_collect_stream, + __entry->collected_to, __entry->front) + ); + +-TRACE_EVENT(netfs_progress, +- TP_PROTO(const struct netfs_io_subrequest *subreq, +- unsigned long long start, size_t avail, size_t part), +- +- TP_ARGS(subreq, start, avail, part), +- +- TP_STRUCT__entry( +- __field(unsigned int, rreq) +- __field(unsigned int, subreq) +- __field(unsigned int, consumed) +- __field(unsigned int, transferred) +- __field(unsigned long long, f_start) +- __field(unsigned int, f_avail) +- __field(unsigned int, f_part) +- __field(unsigned char, slot) +- ), +- +- TP_fast_assign( +- __entry->rreq = subreq->rreq->debug_id; +- __entry->subreq = subreq->debug_index; +- __entry->consumed = subreq->consumed; +- __entry->transferred = subreq->transferred; +- __entry->f_start = start; +- __entry->f_avail = avail; +- __entry->f_part = part; +- __entry->slot = subreq->curr_folioq_slot; +- ), +- +- TP_printk("R=%08x[%02x] s=%llx ct=%x/%x pa=%x/%x sl=%x", +- __entry->rreq, __entry->subreq, __entry->f_start, +- __entry->consumed, __entry->transferred, +- __entry->f_part, __entry->f_avail, __entry->slot) +- ); +- +-TRACE_EVENT(netfs_donate, +- TP_PROTO(const struct netfs_io_request *rreq, +- const struct netfs_io_subrequest *from, +- const struct netfs_io_subrequest *to, +- size_t amount, +- enum netfs_donate_trace trace), +- +- TP_ARGS(rreq, from, to, amount, trace), +- +- TP_STRUCT__entry( +- __field(unsigned int, rreq) 
+- __field(unsigned int, from) +- __field(unsigned int, to) +- __field(unsigned int, amount) +- __field(enum netfs_donate_trace, trace) +- ), +- +- TP_fast_assign( +- __entry->rreq = rreq->debug_id; +- __entry->from = from->debug_index; +- __entry->to = to ? to->debug_index : -1; +- __entry->amount = amount; +- __entry->trace = trace; +- ), +- +- TP_printk("R=%08x[%02x] -> [%02x] %s am=%x", +- __entry->rreq, __entry->from, __entry->to, +- __print_symbolic(__entry->trace, netfs_donate_traces), +- __entry->amount) +- ); +- + #undef EM + #undef E_ + #endif /* _TRACE_NETFS_H */ +diff --git a/lib/iov_iter.c b/lib/iov_iter.c +index 97003155bfac..4a6a9f419bd7 100644 +--- a/lib/iov_iter.c ++++ b/lib/iov_iter.c +@@ -527,39 +527,6 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) + i->__iov = iov; + } + +-static void iov_iter_folioq_advance(struct iov_iter *i, size_t size) +-{ +- const struct folio_queue *folioq = i->folioq; +- unsigned int slot = i->folioq_slot; +- +- if (!i->count) +- return; +- i->count -= size; +- +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = folioq->next; +- slot = 0; +- } +- +- size += i->iov_offset; /* From beginning of current segment. 
*/ +- do { +- size_t fsize = folioq_folio_size(folioq, slot); +- +- if (likely(size < fsize)) +- break; +- size -= fsize; +- slot++; +- if (slot >= folioq_nr_slots(folioq) && folioq->next) { +- folioq = folioq->next; +- slot = 0; +- } +- } while (size); +- +- i->iov_offset = size; +- i->folioq_slot = slot; +- i->folioq = folioq; +-} +- + void iov_iter_advance(struct iov_iter *i, size_t size) + { + if (unlikely(i->count < size)) +@@ -572,40 +539,12 @@ void iov_iter_advance(struct iov_iter *i, size_t size) + iov_iter_iovec_advance(i, size); + } else if (iov_iter_is_bvec(i)) { + iov_iter_bvec_advance(i, size); +- } else if (iov_iter_is_folioq(i)) { +- iov_iter_folioq_advance(i, size); + } else if (iov_iter_is_discard(i)) { + i->count -= size; + } + } + EXPORT_SYMBOL(iov_iter_advance); + +-static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll) +-{ +- const struct folio_queue *folioq = i->folioq; +- unsigned int slot = i->folioq_slot; +- +- for (;;) { +- size_t fsize; +- +- if (slot == 0) { +- folioq = folioq->prev; +- slot = folioq_nr_slots(folioq); +- } +- slot--; +- +- fsize = folioq_folio_size(folioq, slot); +- if (unroll <= fsize) { +- i->iov_offset = fsize - unroll; +- break; +- } +- unroll -= fsize; +- } +- +- i->folioq_slot = slot; +- i->folioq = folioq; +-} +- + void iov_iter_revert(struct iov_iter *i, size_t unroll) + { + if (!unroll) +@@ -637,9 +576,6 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) + } + unroll -= n; + } +- } else if (iov_iter_is_folioq(i)) { +- i->iov_offset = 0; +- iov_iter_folioq_revert(i, unroll); + } else { /* same logics for iovec and kvec */ + const struct iovec *iov = iter_iov(i); + while (1) { +@@ -667,9 +603,6 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) + if (iov_iter_is_bvec(i)) + return min(i->count, i->bvec->bv_len - i->iov_offset); + } +- if (unlikely(iov_iter_is_folioq(i))) +- return !i->count ? 
0 : +- umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); + return i->count; + } + EXPORT_SYMBOL(iov_iter_single_seg_count); +@@ -706,36 +639,6 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, + } + EXPORT_SYMBOL(iov_iter_bvec); + +-/** +- * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue +- * @i: The iterator to initialise. +- * @direction: The direction of the transfer. +- * @folioq: The starting point in the folio queue. +- * @first_slot: The first slot in the folio queue to use +- * @offset: The offset into the folio in the first slot to start at +- * @count: The size of the I/O buffer in bytes. +- * +- * Set up an I/O iterator to either draw data out of the pages attached to an +- * inode or to inject data into those pages. The pages *must* be prevented +- * from evaporation, either by taking a ref on them or locking them by the +- * caller. +- */ +-void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, +- const struct folio_queue *folioq, unsigned int first_slot, +- unsigned int offset, size_t count) +-{ +- BUG_ON(direction & ~1); +- *i = (struct iov_iter) { +- .iter_type = ITER_FOLIOQ, +- .data_source = direction, +- .folioq = folioq, +- .folioq_slot = first_slot, +- .count = count, +- .iov_offset = offset, +- }; +-} +-EXPORT_SYMBOL(iov_iter_folio_queue); +- + /** + * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray + * @i: The iterator to initialise. +@@ -862,19 +765,12 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, + if (iov_iter_is_bvec(i)) + return iov_iter_aligned_bvec(i, addr_mask, len_mask); + +- /* With both xarray and folioq types, we're dealing with whole folios. 
*/ + if (iov_iter_is_xarray(i)) { + if (i->count & len_mask) + return false; + if ((i->xarray_start + i->iov_offset) & addr_mask) + return false; + } +- if (iov_iter_is_folioq(i)) { +- if (i->count & len_mask) +- return false; +- if (i->iov_offset & addr_mask) +- return false; +- } + + return true; + } +@@ -939,9 +835,6 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) + if (iov_iter_is_bvec(i)) + return iov_iter_alignment_bvec(i); + +- /* With both xarray and folioq types, we're dealing with whole folios. */ +- if (iov_iter_is_folioq(i)) +- return i->iov_offset | i->count; + if (iov_iter_is_xarray(i)) + return (i->xarray_start + i->iov_offset) | i->count; + +@@ -994,62 +887,6 @@ static int want_pages_array(struct page ***res, size_t size, + return count; + } + +-static ssize_t iter_folioq_get_pages(struct iov_iter *iter, +- struct page ***ppages, size_t maxsize, +- unsigned maxpages, size_t *_start_offset) +-{ +- const struct folio_queue *folioq = iter->folioq; +- struct page **pages; +- unsigned int slot = iter->folioq_slot; +- size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset; +- +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = folioq->next; +- slot = 0; +- if (WARN_ON(iov_offset != 0)) +- return -EIO; +- } +- +- maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages); +- if (!maxpages) +- return -ENOMEM; +- *_start_offset = iov_offset & ~PAGE_MASK; +- pages = *ppages; +- +- for (;;) { +- struct folio *folio = folioq_folio(folioq, slot); +- size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot); +- size_t part = PAGE_SIZE - offset % PAGE_SIZE; +- +- part = umin(part, umin(maxsize - extracted, fsize - offset)); +- count -= part; +- iov_offset += part; +- extracted += part; +- +- *pages = folio_page(folio, offset / PAGE_SIZE); +- get_page(*pages); +- pages++; +- maxpages--; +- if (maxpages == 0 || extracted >= maxsize) +- break; +- +- if (offset >= fsize) { +- iov_offset = 0; +- slot++; 
+- if (slot == folioq_nr_slots(folioq) && folioq->next) { +- folioq = folioq->next; +- slot = 0; +- } +- } +- } +- +- iter->count = count; +- iter->iov_offset = iov_offset; +- iter->folioq = folioq; +- iter->folioq_slot = slot; +- return extracted; +-} +- + static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, + pgoff_t index, unsigned int nr_pages) + { +@@ -1197,8 +1034,6 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, + } + return maxsize; + } +- if (iov_iter_is_folioq(i)) +- return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); + if (iov_iter_is_xarray(i)) + return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); + return -EFAULT; +@@ -1283,11 +1118,6 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) + return iov_npages(i, maxpages); + if (iov_iter_is_bvec(i)) + return bvec_npages(i, maxpages); +- if (iov_iter_is_folioq(i)) { +- unsigned offset = i->iov_offset % PAGE_SIZE; +- int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); +- return min(npages, maxpages); +- } + if (iov_iter_is_xarray(i)) { + unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; + int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); +@@ -1568,68 +1398,6 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) + i->nr_segs = state->nr_segs; + } + +-/* +- * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does +- * not get references on the pages, nor does it get a pin on them. 
+- */ +-static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, +- struct page ***pages, size_t maxsize, +- unsigned int maxpages, +- iov_iter_extraction_t extraction_flags, +- size_t *offset0) +-{ +- const struct folio_queue *folioq = i->folioq; +- struct page **p; +- unsigned int nr = 0; +- size_t extracted = 0, offset, slot = i->folioq_slot; +- +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = folioq->next; +- slot = 0; +- if (WARN_ON(i->iov_offset != 0)) +- return -EIO; +- } +- +- offset = i->iov_offset & ~PAGE_MASK; +- *offset0 = offset; +- +- maxpages = want_pages_array(pages, maxsize, offset, maxpages); +- if (!maxpages) +- return -ENOMEM; +- p = *pages; +- +- for (;;) { +- struct folio *folio = folioq_folio(folioq, slot); +- size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot); +- size_t part = PAGE_SIZE - offset % PAGE_SIZE; +- +- if (offset < fsize) { +- part = umin(part, umin(maxsize - extracted, fsize - offset)); +- i->count -= part; +- i->iov_offset += part; +- extracted += part; +- +- p[nr++] = folio_page(folio, offset / PAGE_SIZE); +- } +- +- if (nr >= maxpages || extracted >= maxsize) +- break; +- +- if (i->iov_offset >= fsize) { +- i->iov_offset = 0; +- slot++; +- if (slot == folioq_nr_slots(folioq) && folioq->next) { +- folioq = folioq->next; +- slot = 0; +- } +- } +- } +- +- i->folioq = folioq; +- i->folioq_slot = slot; +- return extracted; +-} +- + /* + * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not + * get references on the pages, nor does it get a pin on them. +@@ -1850,8 +1618,8 @@ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, + * added to the pages, but refs will not be taken. + * iov_iter_extract_will_pin() will return true. + * +- * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the +- * pages are merely listed; no extra refs or pins are obtained. 
++ * (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are ++ * merely listed; no extra refs or pins are obtained. + * iov_iter_extract_will_pin() will return 0. + * + * Note also: +@@ -1886,10 +1654,6 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, + return iov_iter_extract_bvec_pages(i, pages, maxsize, + maxpages, extraction_flags, + offset0); +- if (iov_iter_is_folioq(i)) +- return iov_iter_extract_folioq_pages(i, pages, maxsize, +- maxpages, extraction_flags, +- offset0); + if (iov_iter_is_xarray(i)) + return iov_iter_extract_xarray_pages(i, pages, maxsize, + maxpages, extraction_flags, +diff --git a/lib/kunit_iov_iter.c b/lib/kunit_iov_iter.c +index 13e15687675a..27e0c8ee71d8 100644 +--- a/lib/kunit_iov_iter.c ++++ b/lib/kunit_iov_iter.c +@@ -12,7 +12,6 @@ + #include + #include + #include +-#include + #include + + MODULE_DESCRIPTION("iov_iter testing"); +@@ -63,9 +62,6 @@ static void *__init iov_kunit_create_buffer(struct kunit *test, + KUNIT_ASSERT_EQ(test, got, npages); + } + +- for (int i = 0; i < npages; i++) +- pages[i]->index = i; +- + buffer = vmap(pages, npages, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buffer); + +@@ -366,179 +362,6 @@ static void __init iov_kunit_copy_from_bvec(struct kunit *test) + KUNIT_SUCCEED(test); + } + +-static void iov_kunit_destroy_folioq(void *data) +-{ +- struct folio_queue *folioq, *next; +- +- for (folioq = data; folioq; folioq = next) { +- next = folioq->next; +- for (int i = 0; i < folioq_nr_slots(folioq); i++) +- if (folioq_folio(folioq, i)) +- folio_put(folioq_folio(folioq, i)); +- kfree(folioq); +- } +-} +- +-static void __init iov_kunit_load_folioq(struct kunit *test, +- struct iov_iter *iter, int dir, +- struct folio_queue *folioq, +- struct page **pages, size_t npages) +-{ +- struct folio_queue *p = folioq; +- size_t size = 0; +- int i; +- +- for (i = 0; i < npages; i++) { +- if (folioq_full(p)) { +- p->next = kzalloc(sizeof(struct folio_queue), 
GFP_KERNEL); +- KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p->next); +- folioq_init(p->next); +- p->next->prev = p; +- p = p->next; +- } +- folioq_append(p, page_folio(pages[i])); +- size += PAGE_SIZE; +- } +- iov_iter_folio_queue(iter, dir, folioq, 0, 0, size); +-} +- +-static struct folio_queue *iov_kunit_create_folioq(struct kunit *test) +-{ +- struct folio_queue *folioq; +- +- folioq = kzalloc(sizeof(struct folio_queue), GFP_KERNEL); +- KUNIT_ASSERT_NOT_ERR_OR_NULL(test, folioq); +- kunit_add_action_or_reset(test, iov_kunit_destroy_folioq, folioq); +- folioq_init(folioq); +- return folioq; +-} +- +-/* +- * Test copying to a ITER_FOLIOQ-type iterator. +- */ +-static void __init iov_kunit_copy_to_folioq(struct kunit *test) +-{ +- const struct kvec_test_range *pr; +- struct iov_iter iter; +- struct folio_queue *folioq; +- struct page **spages, **bpages; +- u8 *scratch, *buffer; +- size_t bufsize, npages, size, copied; +- int i, patt; +- +- bufsize = 0x100000; +- npages = bufsize / PAGE_SIZE; +- +- folioq = iov_kunit_create_folioq(test); +- +- scratch = iov_kunit_create_buffer(test, &spages, npages); +- for (i = 0; i < bufsize; i++) +- scratch[i] = pattern(i); +- +- buffer = iov_kunit_create_buffer(test, &bpages, npages); +- memset(buffer, 0, bufsize); +- +- iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); +- +- i = 0; +- for (pr = kvec_test_ranges; pr->from >= 0; pr++) { +- size = pr->to - pr->from; +- KUNIT_ASSERT_LE(test, pr->to, bufsize); +- +- iov_iter_folio_queue(&iter, READ, folioq, 0, 0, pr->to); +- iov_iter_advance(&iter, pr->from); +- copied = copy_to_iter(scratch + i, size, &iter); +- +- KUNIT_EXPECT_EQ(test, copied, size); +- KUNIT_EXPECT_EQ(test, iter.count, 0); +- KUNIT_EXPECT_EQ(test, iter.iov_offset, pr->to % PAGE_SIZE); +- i += size; +- if (test->status == KUNIT_FAILURE) +- goto stop; +- } +- +- /* Build the expected image in the scratch buffer. 
*/ +- patt = 0; +- memset(scratch, 0, bufsize); +- for (pr = kvec_test_ranges; pr->from >= 0; pr++) +- for (i = pr->from; i < pr->to; i++) +- scratch[i] = pattern(patt++); +- +- /* Compare the images */ +- for (i = 0; i < bufsize; i++) { +- KUNIT_EXPECT_EQ_MSG(test, buffer[i], scratch[i], "at i=%x", i); +- if (buffer[i] != scratch[i]) +- return; +- } +- +-stop: +- KUNIT_SUCCEED(test); +-} +- +-/* +- * Test copying from a ITER_FOLIOQ-type iterator. +- */ +-static void __init iov_kunit_copy_from_folioq(struct kunit *test) +-{ +- const struct kvec_test_range *pr; +- struct iov_iter iter; +- struct folio_queue *folioq; +- struct page **spages, **bpages; +- u8 *scratch, *buffer; +- size_t bufsize, npages, size, copied; +- int i, j; +- +- bufsize = 0x100000; +- npages = bufsize / PAGE_SIZE; +- +- folioq = iov_kunit_create_folioq(test); +- +- buffer = iov_kunit_create_buffer(test, &bpages, npages); +- for (i = 0; i < bufsize; i++) +- buffer[i] = pattern(i); +- +- scratch = iov_kunit_create_buffer(test, &spages, npages); +- memset(scratch, 0, bufsize); +- +- iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); +- +- i = 0; +- for (pr = kvec_test_ranges; pr->from >= 0; pr++) { +- size = pr->to - pr->from; +- KUNIT_ASSERT_LE(test, pr->to, bufsize); +- +- iov_iter_folio_queue(&iter, WRITE, folioq, 0, 0, pr->to); +- iov_iter_advance(&iter, pr->from); +- copied = copy_from_iter(scratch + i, size, &iter); +- +- KUNIT_EXPECT_EQ(test, copied, size); +- KUNIT_EXPECT_EQ(test, iter.count, 0); +- KUNIT_EXPECT_EQ(test, iter.iov_offset, pr->to % PAGE_SIZE); +- i += size; +- } +- +- /* Build the expected image in the main buffer. 
*/ +- i = 0; +- memset(buffer, 0, bufsize); +- for (pr = kvec_test_ranges; pr->from >= 0; pr++) { +- for (j = pr->from; j < pr->to; j++) { +- buffer[i++] = pattern(j); +- if (i >= bufsize) +- goto stop; +- } +- } +-stop: +- +- /* Compare the images */ +- for (i = 0; i < bufsize; i++) { +- KUNIT_EXPECT_EQ_MSG(test, scratch[i], buffer[i], "at i=%x", i); +- if (scratch[i] != buffer[i]) +- return; +- } +- +- KUNIT_SUCCEED(test); +-} +- + static void iov_kunit_destroy_xarray(void *data) + { + struct xarray *xarray = data; +@@ -854,85 +677,6 @@ static void __init iov_kunit_extract_pages_bvec(struct kunit *test) + KUNIT_SUCCEED(test); + } + +-/* +- * Test the extraction of ITER_FOLIOQ-type iterators. +- */ +-static void __init iov_kunit_extract_pages_folioq(struct kunit *test) +-{ +- const struct kvec_test_range *pr; +- struct folio_queue *folioq; +- struct iov_iter iter; +- struct page **bpages, *pagelist[8], **pages = pagelist; +- ssize_t len; +- size_t bufsize, size = 0, npages; +- int i, from; +- +- bufsize = 0x100000; +- npages = bufsize / PAGE_SIZE; +- +- folioq = iov_kunit_create_folioq(test); +- +- iov_kunit_create_buffer(test, &bpages, npages); +- iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); +- +- for (pr = kvec_test_ranges; pr->from >= 0; pr++) { +- from = pr->from; +- size = pr->to - from; +- KUNIT_ASSERT_LE(test, pr->to, bufsize); +- +- iov_iter_folio_queue(&iter, WRITE, folioq, 0, 0, pr->to); +- iov_iter_advance(&iter, from); +- +- do { +- size_t offset0 = LONG_MAX; +- +- for (i = 0; i < ARRAY_SIZE(pagelist); i++) +- pagelist[i] = (void *)(unsigned long)0xaa55aa55aa55aa55ULL; +- +- len = iov_iter_extract_pages(&iter, &pages, 100 * 1024, +- ARRAY_SIZE(pagelist), 0, &offset0); +- KUNIT_EXPECT_GE(test, len, 0); +- if (len < 0) +- break; +- KUNIT_EXPECT_LE(test, len, size); +- KUNIT_EXPECT_EQ(test, iter.count, size - len); +- if (len == 0) +- break; +- size -= len; +- KUNIT_EXPECT_GE(test, (ssize_t)offset0, 0); +- KUNIT_EXPECT_LT(test, 
offset0, PAGE_SIZE); +- +- for (i = 0; i < ARRAY_SIZE(pagelist); i++) { +- struct page *p; +- ssize_t part = min_t(ssize_t, len, PAGE_SIZE - offset0); +- int ix; +- +- KUNIT_ASSERT_GE(test, part, 0); +- ix = from / PAGE_SIZE; +- KUNIT_ASSERT_LT(test, ix, npages); +- p = bpages[ix]; +- KUNIT_EXPECT_PTR_EQ(test, pagelist[i], p); +- KUNIT_EXPECT_EQ(test, offset0, from % PAGE_SIZE); +- from += part; +- len -= part; +- KUNIT_ASSERT_GE(test, len, 0); +- if (len == 0) +- break; +- offset0 = 0; +- } +- +- if (test->status == KUNIT_FAILURE) +- goto stop; +- } while (iov_iter_count(&iter) > 0); +- +- KUNIT_EXPECT_EQ(test, size, 0); +- KUNIT_EXPECT_EQ(test, iter.count, 0); +- } +- +-stop: +- KUNIT_SUCCEED(test); +-} +- + /* + * Test the extraction of ITER_XARRAY-type iterators. + */ +@@ -1017,13 +761,10 @@ static struct kunit_case __refdata iov_kunit_cases[] = { + KUNIT_CASE(iov_kunit_copy_from_kvec), + KUNIT_CASE(iov_kunit_copy_to_bvec), + KUNIT_CASE(iov_kunit_copy_from_bvec), +- KUNIT_CASE(iov_kunit_copy_to_folioq), +- KUNIT_CASE(iov_kunit_copy_from_folioq), + KUNIT_CASE(iov_kunit_copy_to_xarray), + KUNIT_CASE(iov_kunit_copy_from_xarray), + KUNIT_CASE(iov_kunit_extract_pages_kvec), + KUNIT_CASE(iov_kunit_extract_pages_bvec), +- KUNIT_CASE(iov_kunit_extract_pages_folioq), + KUNIT_CASE(iov_kunit_extract_pages_xarray), + {} + }; +diff --git a/lib/scatterlist.c b/lib/scatterlist.c +index 473b2646f71c..7bc2220fea80 100644 +--- a/lib/scatterlist.c ++++ b/lib/scatterlist.c +@@ -11,7 +11,6 @@ + #include + #include + #include +-#include + + /** + * sg_next - return the next scatterlist entry in a list +@@ -1262,67 +1261,6 @@ static ssize_t extract_kvec_to_sg(struct iov_iter *iter, + return ret; + } + +-/* +- * Extract up to sg_max folios from an FOLIOQ-type iterator and add them to +- * the scatterlist. The pages are not pinned. 
+- */ +-static ssize_t extract_folioq_to_sg(struct iov_iter *iter, +- ssize_t maxsize, +- struct sg_table *sgtable, +- unsigned int sg_max, +- iov_iter_extraction_t extraction_flags) +-{ +- const struct folio_queue *folioq = iter->folioq; +- struct scatterlist *sg = sgtable->sgl + sgtable->nents; +- unsigned int slot = iter->folioq_slot; +- ssize_t ret = 0; +- size_t offset = iter->iov_offset; +- +- BUG_ON(!folioq); +- +- if (slot >= folioq_nr_slots(folioq)) { +- folioq = folioq->next; +- if (WARN_ON_ONCE(!folioq)) +- return 0; +- slot = 0; +- } +- +- do { +- struct folio *folio = folioq_folio(folioq, slot); +- size_t fsize = folioq_folio_size(folioq, slot); +- +- if (offset < fsize) { +- size_t part = umin(maxsize - ret, fsize - offset); +- +- sg_set_page(sg, folio_page(folio, 0), part, offset); +- sgtable->nents++; +- sg++; +- sg_max--; +- offset += part; +- ret += part; +- } +- +- if (offset >= fsize) { +- offset = 0; +- slot++; +- if (slot >= folioq_nr_slots(folioq)) { +- if (!folioq->next) { +- WARN_ON_ONCE(ret < iter->count); +- break; +- } +- folioq = folioq->next; +- slot = 0; +- } +- } +- } while (sg_max > 0 && ret < maxsize); +- +- iter->folioq = folioq; +- iter->folioq_slot = slot; +- iter->iov_offset = offset; +- iter->count -= ret; +- return ret; +-} +- + /* + * Extract up to sg_max folios from an XARRAY-type iterator and add them to + * the scatterlist. The pages are not pinned. +@@ -1385,8 +1323,8 @@ static ssize_t extract_xarray_to_sg(struct iov_iter *iter, + * addition of @sg_max elements. + * + * The pages referred to by UBUF- and IOVEC-type iterators are extracted and +- * pinned; BVEC-, KVEC-, FOLIOQ- and XARRAY-type are extracted but aren't +- * pinned; DISCARD-type is not supported. ++ * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE- ++ * and DISCARD-type are not supported. + * + * No end mark is placed on the scatterlist; that's left to the caller. 
+ * +@@ -1418,9 +1356,6 @@ ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, + case ITER_KVEC: + return extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); +- case ITER_FOLIOQ: +- return extract_folioq_to_sg(iter, maxsize, sgtable, sg_max, +- extraction_flags); + case ITER_XARRAY: + return extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); +-- +2.34.1 + diff --git a/ci/vmtest/configs/DENYLIST b/ci/vmtest/configs/DENYLIST new file mode 100644 index 000000000000..2b4488c9a075 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST @@ -0,0 +1,15 @@ +# TEMPORARY +btf_dump/btf_dump: syntax +kprobe_multi_bench_attach +core_reloc/enum64val +core_reloc/size___diff_sz +core_reloc/type_based___diff_sz +test_ima # All of CI is broken on it following 6.3-rc1 merge +uprobe_multi_test/consumers # CI is broken since 440b65232829 + +lwt_reroute # crashes kernel after netnext merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +tc_links_ingress # started failing after net-next merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +xdp_bonding/xdp_bonding_features # started failing after net merge from 359e54a93ab4 "l2tp: pass correct message length to ip6_append_data" +tc_redirect/tc_redirect_dtime # uapi breakage after net-next commit 885c36e59f46 ("net: Re-use and set mono_delivery_time bit for userspace tstamp packets") +migrate_reuseport/IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +migrate_reuseport/IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation diff --git a/ci/vmtest/configs/DENYLIST.aarch64 b/ci/vmtest/configs/DENYLIST.aarch64 new file mode 100644 index 000000000000..487b19ede4b6 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.aarch64 @@ -0,0 +1,4 @@ +cgrp_local_storage # libbpf: prog 'update_cookie_tracing': failed to attach: ERROR: strerror_r(-524)=22 +core_reloc_btfgen # run_core_reloc_tests:FAIL:run_btfgen unexpected 
error: 32512 (errno 22) +usdt/multispec # usdt_300_bad_attach unexpected pointer: 0x558c63d8f0 +xdp_bonding # whole test suite is very unstable on aarch64 diff --git a/ci/vmtest/configs/DENYLIST.rc b/ci/vmtest/configs/DENYLIST.rc new file mode 100644 index 000000000000..8aa33e6b7144 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.rc @@ -0,0 +1,3 @@ +send_signal/send_signal_nmi # PMU events configure correctly but don't trigger NMI's for some reason (AMD nested virt) +send_signal/send_signal_nmi_thread # Same as above +token/obj_priv_implicit_token_envvar # Unknown root cause, but reliably fails diff --git a/ci/vmtest/configs/DENYLIST.s390x b/ci/vmtest/configs/DENYLIST.s390x new file mode 100644 index 000000000000..9b90b615aea5 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.s390x @@ -0,0 +1,11 @@ +deny_namespace # not yet in bpf denylist +tc_redirect/tc_redirect_dtime # very flaky +lru_bug # not yet in bpf-next denylist +# Disabled temporarily for a crash. +# https://lore.kernel.org/bpf/c9923c1d-971d-4022-8dc8-1364e929d34c@gmail.com/ +dummy_st_ops/dummy_init_ptr_arg +fexit_bpf2bpf +tailcalls +trace_ext +xdp_bpf2bpf +xdp_metadata diff --git a/ci/vmtest/configs/DENYLIST.x86_64 b/ci/vmtest/configs/DENYLIST.x86_64 new file mode 100644 index 000000000000..6fc3413daab9 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.x86_64 @@ -0,0 +1 @@ +netcnt # with kvm enabled, fail with packets unexpected packets: actual 10001 != expected 10000 diff --git a/ci/vmtest/configs/run_veristat.kernel.cfg b/ci/vmtest/configs/run_veristat.kernel.cfg new file mode 100644 index 000000000000..e8aada00078c --- /dev/null +++ b/ci/vmtest/configs/run_veristat.kernel.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${BPF_SELFTESTS_DIR}" +VERISTAT_OBJECTS_GLOB="*.bpf.o" +VERISTAT_CFG_FILE="${BPF_SELFTESTS_DIR}/veristat.cfg" +VERISTAT_OUTPUT="veristat-kernel" diff --git a/ci/vmtest/configs/run_veristat.meta.cfg b/ci/vmtest/configs/run_veristat.meta.cfg new file mode 100644 index 000000000000..ff13d9bbe683 --- 
/dev/null +++ b/ci/vmtest/configs/run_veristat.meta.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${WORKING_DIR}/bpf_objects" +VERISTAT_OBJECTS_GLOB="*.o" +VERISTAT_OUTPUT="veristat-meta" +VERISTAT_CFG_FILE="${VMTEST_CONFIGS_PATH}/veristat_meta.cfg" diff --git a/ci/vmtest/configs/veristat_meta.cfg b/ci/vmtest/configs/veristat_meta.cfg new file mode 100644 index 000000000000..a8c25d71cb9e --- /dev/null +++ b/ci/vmtest/configs/veristat_meta.cfg @@ -0,0 +1,10 @@ +# List of exceptions we know about that are not going to work with veristat. + +# needs 'migrate_misplaced_page' which went away in +# commit 73eab3ca481e ("mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio()") +!numamove_bpf-numamove_bpf.o + +# use non-libbpf loader +!takeover_bpf_lib-takeover.bpf.o +!tcp_tuner_bpf_lib-tcptuner.bpf.o + diff --git a/ci/vmtest/helpers.sh b/ci/vmtest/helpers.sh new file mode 100755 index 000000000000..c44d0983156d --- /dev/null +++ b/ci/vmtest/helpers.sh @@ -0,0 +1,38 @@ +# shellcheck shell=bash + +# $1 - start or end +# $2 - fold identifier, no spaces +# $3 - fold section description +foldable() { + local YELLOW='\033[1;33m' + local NOCOLOR='\033[0m' + if [ $1 = "start" ]; then + line="::group::$2" + if [ ! 
-z "${3:-}" ]; then + line="$line - ${YELLOW}$3${NOCOLOR}" + fi + else + line="::endgroup::" + fi + echo -e "$line" +} + +__print() { + local TITLE="" + if [[ -n $2 ]]; then + TITLE=" title=$2" + fi + echo "::$1${TITLE}::$3" +} + +# $1 - title +# $2 - message +print_error() { + __print error $1 $2 +} + +# $1 - title +# $2 - message +print_notice() { + __print notice $1 $2 +} diff --git a/ci/vmtest/run_selftests.sh b/ci/vmtest/run_selftests.sh new file mode 100755 index 000000000000..7f65d46f4abc --- /dev/null +++ b/ci/vmtest/run_selftests.sh @@ -0,0 +1,195 @@ +#!/bin/bash + +# run_selftest.sh will run the tests within /${PROJECT_NAME}/selftests/bpf +# If no specific test names are given, all test will be ran, otherwise, it will +# run the test passed as parameters. +# There is 2 ways to pass test names. +# 1) command-line arguments to this script +# 2) a comma-separated list of test names passed as `run_tests` boot parameters. +# test names passed as any of those methods will be ran. + +set -euo pipefail + +source "$(cd "$(dirname "$0")" && pwd)/helpers.sh" + +ARCH=$(uname -m) + +STATUS_FILE=/exitstatus +OUTPUT_DIR=/command_output + +BPF_SELFTESTS_DIR="/${PROJECT_NAME}/selftests/bpf" +VMTEST_CONFIGS_PATH="/${PROJECT_NAME}/vmtest/configs" + +read_lists() { + (for path in "$@"; do + if [[ -s "$path" ]]; then + cat "$path" + fi; + done) | cut -d'#' -f1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -s '\n' ',' +} + +DENYLIST=$(read_lists \ + "$BPF_SELFTESTS_DIR/DENYLIST" \ + "$BPF_SELFTESTS_DIR/DENYLIST.${ARCH}" \ + "$VMTEST_CONFIGS_PATH/DENYLIST" \ + "$VMTEST_CONFIGS_PATH/DENYLIST.${ARCH}" \ +) +ALLOWLIST=$(read_lists \ + "$BPF_SELFTESTS_DIR/ALLOWLIST" \ + "$BPF_SELFTESTS_DIR/ALLOWLIST.${ARCH}" \ + "$VMTEST_CONFIGS_PATH/ALLOWLIST" \ + "$VMTEST_CONFIGS_PATH/ALLOWLIST.${ARCH}" \ +) + +declare -a TEST_NAMES=() + +read_test_names() { + foldable start read_test_names "Reading test names from boot parameters and command line arguments" + # Check if test names 
were passed as boot parameter. + # We expect `run_tests` to be a comma-separated list of test names. + IFS=',' read -r -a test_names_from_boot <<< \ + "$(sed -n 's/.*run_tests=\([^ ]*\).*/\1/p' /proc/cmdline)" + + echo "${#test_names_from_boot[@]} tests extracted from boot parameters: ${test_names_from_boot[*]}" + # Sort and only keep unique test names from both boot params and arguments + # TEST_NAMES will contain a sorted list of uniq tests to be ran. + # Only do this if any of $test_names_from_boot[@] or $@ has elements as + # "printf '%s\0'" will otherwise generate an empty element. + if [[ ${#test_names_from_boot[@]} -gt 0 || $# -gt 0 ]] + then + readarray -t TEST_NAMES < \ + <(printf '%s\0' "${test_names_from_boot[@]}" "$@" | \ + sort --zero-terminated --unique | \ + xargs --null --max-args=1) + fi + foldable end read_test_names +} + +test_progs_helper() { + local selftest="test_progs${1}" + local args="$2" + + json_file=${selftest/-/_} + if [ "$2" == "-j" ] + then + json_file+="_parallel" + fi + json_file="/${json_file}.json" + + foldable start ${selftest} "Testing ${selftest}" + # "&& true" does not change the return code (it is not executed + # if the Python script fails), but it prevents exiting on a + # failure due to the "set -e". + ./${selftest} ${args} ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} --json-summary "${json_file}" && true + echo "${selftest}:$?" >>"${STATUS_FILE}" + foldable end ${selftest} +} + +test_progs() { + test_progs_helper "" "" +} + +test_progs_parallel() { + test_progs_helper "" "-j" +} + +test_progs_no_alu32() { + test_progs_helper "-no_alu32" "" +} + +test_progs_no_alu32_parallel() { + test_progs_helper "-no_alu32" "-j" +} + +test_progs_cpuv4() { + test_progs_helper "-cpuv4" "" +} + +test_maps() { + foldable start test_maps "Testing test_maps" + taskset 0xF ./test_maps && true + echo "test_maps:$?" 
>>"${STATUS_FILE}" + foldable end test_maps +} + +test_verifier() { + foldable start test_verifier "Testing test_verifier" + ./test_verifier && true + echo "test_verifier:$?" >>"${STATUS_FILE}" + foldable end test_verifier +} + +run_veristat_helper() { + local mode="${1}" + + # Make veristat commands visible in the log + if [ -o xtrace ]; then + xtrace_was_on="1" + else + xtrace_was_on="" + set -x + fi + + ( + # shellcheck source=ci/vmtest/configs/run_veristat.default.cfg + # shellcheck source=ci/vmtest/configs/run_veristat.meta.cfg + source "${VMTEST_CONFIGS_PATH}/run_veristat.${mode}.cfg" + mkdir -p ${OUTPUT_DIR} + pushd "${VERISTAT_OBJECTS_DIR}" + + "${BPF_SELFTESTS_DIR}/veristat" -o csv -q -e file,prog,verdict,states \ + -f "@${VERISTAT_CFG_FILE}" ${VERISTAT_OBJECTS_GLOB} > \ + "${OUTPUT_DIR}/${VERISTAT_OUTPUT}" + + echo "run_veristat_${mode}:$?" >> ${STATUS_FILE} + popd + ) + + # Hide commands again + if [ -z "$xtrace_was_on" ]; then + set +x + fi + +} + +run_veristat_kernel() { + foldable start run_veristat_kernel "Running veristat.kernel" + run_veristat_helper "kernel" + foldable end run_veristat_kernel +} + +run_veristat_meta() { + foldable start run_veristat_meta "Running veristat.meta" + run_veristat_helper "meta" + foldable end run_veristat_meta +} + +foldable end vm_init + +foldable start kernel_config "Kconfig" + +zcat /proc/config.gz + +foldable end kernel_config + +echo "DENYLIST: ${DENYLIST}" +echo "ALLOWLIST: ${ALLOWLIST}" + +cd ${PROJECT_NAME}/selftests/bpf + +# populate TEST_NAMES +read_test_names "$@" +# if we don't have any test name provided to the script, we run all tests. +if [ ${#TEST_NAMES[@]} -eq 0 ]; then + test_progs + test_progs_no_alu32 + test_progs_cpuv4 + test_maps + test_verifier +else + # else we run the tests passed as command-line arguments and through boot + # parameter. 
+ for test_name in "${TEST_NAMES[@]}"; do + "${test_name}" + done +fi diff --git a/ci/vmtest/vmtest_selftests.sh b/ci/vmtest/vmtest_selftests.sh new file mode 100755 index 000000000000..e4e368b80b28 --- /dev/null +++ b/ci/vmtest/vmtest_selftests.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +# run_selftest.sh will run the tests within /${PROJECT_NAME}/selftests/bpf +# If no specific test names are given, all test will be ran, otherwise, it will +# run the test passed as parameters. +# There is 2 ways to pass test names. +# 1) command-line arguments to this script +# 2) a comma-separated list of test names passed as `run_tests` boot parameters. +# test names passed as any of those methods will be ran. + +set -euo pipefail + +source "$(cd "$(dirname "$0")" && pwd)/helpers.sh" + +ARCH=$(uname -m) +DEPLOYMENT=$(if [[ "$GITHUB_REPOSITORY" == *"-rc" ]]; then echo "rc"; else echo "prod"; fi) + +STATUS_FILE=/mnt/vmtest/exitstatus +OUTPUT_DIR=/mnt/vmtest + +WORKING_DIR="/${PROJECT_NAME}" +BPF_SELFTESTS_DIR="${WORKING_DIR}/selftests/bpf" +VMTEST_CONFIGS_PATH="${WORKING_DIR}/ci/vmtest/configs" + +read_lists() { + (for path in "$@"; do + if [[ -s "$path" ]]; then + cat "$path" + fi; + done) | cut -d'#' -f1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -s '\n' ',' +} + +DENYLIST=$(read_lists \ + "$BPF_SELFTESTS_DIR/DENYLIST" \ + "$BPF_SELFTESTS_DIR/DENYLIST.${ARCH}" \ + "$VMTEST_CONFIGS_PATH/DENYLIST" \ + "$VMTEST_CONFIGS_PATH/DENYLIST.${ARCH}" \ + "$VMTEST_CONFIGS_PATH/DENYLIST.${DEPLOYMENT}" \ +) +ALLOWLIST=$(read_lists \ + "$BPF_SELFTESTS_DIR/ALLOWLIST" \ + "$BPF_SELFTESTS_DIR/ALLOWLIST.${ARCH}" \ + "$VMTEST_CONFIGS_PATH/ALLOWLIST" \ + "$VMTEST_CONFIGS_PATH/ALLOWLIST.${ARCH}" \ +) + +declare -a TEST_NAMES=() + +read_test_names() { + foldable start read_test_names "Reading test names from boot parameters and command line arguments" + # Check if test names were passed as boot parameter. + # We expect `run_tests` to be a comma-separated list of test names. 
+ IFS=',' read -r -a test_names_from_boot <<< \ + "$(sed -n 's/.*run_tests=\([^ ]*\).*/\1/p' /proc/cmdline)" + + echo "${#test_names_from_boot[@]} tests extracted from boot parameters: ${test_names_from_boot[*]}" + # Sort and only keep unique test names from both boot params and arguments + # TEST_NAMES will contain a sorted list of uniq tests to be ran. + # Only do this if any of $test_names_from_boot[@] or $@ has elements as + # "printf '%s\0'" will otherwise generate an empty element. + if [[ ${#test_names_from_boot[@]} -gt 0 || $# -gt 0 ]] + then + readarray -t TEST_NAMES < \ + <(printf '%s\0' "${test_names_from_boot[@]}" "$@" | \ + sort --zero-terminated --unique | \ + xargs --null --max-args=1) + fi + foldable end read_test_names +} + +test_progs_helper() { + local selftest="test_progs${1}" + local args="$2" + + json_file=${selftest/-/_} + if [ "$2" == "-j" ] + then + json_file+="_parallel" + fi + json_file="${OUTPUT_DIR}/${json_file}.json" + + foldable start ${selftest} "Testing ${selftest}" + # "&& true" does not change the return code (it is not executed + # if the Python script fails), but it prevents exiting on a + # failure due to the "set -e". + ./${selftest} ${args} ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} --json-summary "${json_file}" && true + echo "${selftest}:$?" >>"${STATUS_FILE}" + foldable end ${selftest} +} + +test_progs() { + test_progs_helper "" "" +} + +test_progs_parallel() { + test_progs_helper "" "-j" +} + +test_progs_no_alu32() { + test_progs_helper "-no_alu32" "" +} + +test_progs_no_alu32_parallel() { + test_progs_helper "-no_alu32" "-j" +} + +test_progs_cpuv4() { + test_progs_helper "-cpuv4" "" +} + +test_maps() { + foldable start test_maps "Testing test_maps" + taskset 0xF ./test_maps && true + echo "test_maps:$?" >>"${STATUS_FILE}" + foldable end test_maps +} + +test_verifier() { + foldable start test_verifier "Testing test_verifier" + ./test_verifier && true + echo "test_verifier:$?" 
>>"${STATUS_FILE}" + foldable end test_verifier +} + +run_veristat_helper() { + local mode="${1}" + + # Make veristat commands visible in the log + if [ -o xtrace ]; then + xtrace_was_on="1" + else + xtrace_was_on="" + set -x + fi + + ( + # shellcheck source=ci/vmtest/configs/run_veristat.default.cfg + # shellcheck source=ci/vmtest/configs/run_veristat.meta.cfg + source "${VMTEST_CONFIGS_PATH}/run_veristat.${mode}.cfg" + pushd "${VERISTAT_OBJECTS_DIR}" + + "${BPF_SELFTESTS_DIR}/veristat" -o csv -q -e file,prog,verdict,states ${VERISTAT_OBJECTS_GLOB} > \ + "${OUTPUT_DIR}/${VERISTAT_OUTPUT}" + + echo "run_veristat_${mode}:$?" >> ${STATUS_FILE} + popd + ) + + # Hide commands again + if [ -z "$xtrace_was_on" ]; then + set +x + fi + +} + +run_veristat_kernel() { + foldable start run_veristat_kernel "Running veristat.kernel" + run_veristat_helper "kernel" + foldable end run_veristat_kernel +} + +run_veristat_meta() { + foldable start run_veristat_meta "Running veristat.meta" + run_veristat_helper "meta" + foldable end run_veristat_meta +} + +foldable end vm_init + +foldable start kernel_config "Kconfig" + +zcat /proc/config.gz + +foldable end kernel_config + +echo "DENYLIST: ${DENYLIST}" +echo "ALLOWLIST: ${ALLOWLIST}" + +cd ${PROJECT_NAME}/selftests/bpf + +# populate TEST_NAMES +read_test_names "$@" +# if we don't have any test name provided to the script, we run all tests. +if [ ${#TEST_NAMES[@]} -eq 0 ]; then + test_progs + test_progs_no_alu32 + test_progs_cpuv4 + test_maps + test_verifier +else + # else we run the tests passed as command-line arguments and through boot + # parameter. 
+ for test_name in "${TEST_NAMES[@]}"; do + "${test_name}" + done +fi diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e20b90c36131..de3b681d1d13 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -29,7 +29,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - spinlock_t spinlock ____cacheline_aligned_in_smp; + raw_spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - spin_lock_init(&rb->spinlock); + raw_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -421,10 +421,10 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { - if (!spin_trylock_irqsave(&rb->spinlock, flags)) + if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { - spin_lock_irqsave(&rb->spinlock, flags); + raw_spin_lock_irqsave(&rb->spinlock, flags); } pend_pos = rb->pending_pos; @@ -450,7 +450,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -462,7 +462,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; }